def process_qnode(self, kw: KgtkWriter, current_process_node_id: str,
                  each_node_attributes: EACH_NODE_ATTRIBUTES) -> bool:
    interesting_qnode: bool = False
    if each_node_attributes:
        for k in each_node_attributes:
            if each_node_attributes[k]:
                interesting_qnode = True
                break
    if not interesting_qnode:
        return False

    concat_sentence: str
    explanation: str
    concat_sentence, explanation = self.attribute_to_sentence(
        each_node_attributes, current_process_node_id)

    if self.explain:
        kw.write([
            current_process_node_id, self.sentence_label,
            KgtkFormat.stringify(concat_sentence),
            KgtkFormat.stringify(explanation)
        ])
    else:
        kw.write([
            current_process_node_id, self.sentence_label,
            KgtkFormat.stringify(concat_sentence)
        ])
    return True
def write_row(self, ew: KgtkWriter, node1: str, label: str, node2: str):
    output_row: typing.List[str] = [node1, label, node2]
    if self.idbuilder is None:
        ew.write(output_row)
    else:
        ew.write(self.idbuilder.build(output_row, self.output_line_count))
    self.output_line_count += 1
def single_object_filter_inverted(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        obj_idx: int,
        obj_filter: typing.Set[str],
):
    if verbose:
        print("Applying a single object filter inverted",
              file=error_file, flush=True)

    obj_filter_value: str = list(obj_filter)[0]

    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        if row[obj_idx] != obj_filter_value:
            kw.write(row)
            output_line_count += 1
        else:
            if rw is not None:
                rw.write(row)
            reject_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count),
              file=error_file, flush=True)
def single_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
):
    if verbose:
        print("Applying a single predicate filter",
              file=error_file, flush=True)

    pred_filter_value: str = list(pred_filter)[0]

    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        if row[pred_idx] == pred_filter_value:
            kw.write(row)
            output_line_count += 1
        else:
            if rw is not None:
                rw.write(row)
            reject_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count),
              file=error_file, flush=True)
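# Hedged sketch, not from the original source: the single-value filters above are
# specializations of a general multi-value filter that tests set membership instead
# of one equality.  The function name is illustrative, and the free variables
# `verbose` and `error_file` are assumed to come from the enclosing scope, just as
# they do in the single-value filters above.
def general_predicate_filter(
        kr: KgtkReader,
        kw: KgtkWriter,
        rw: typing.Optional[KgtkWriter],
        pred_idx: int,
        pred_filter: typing.Set[str],
):
    input_line_count: int = 0
    reject_line_count: int = 0
    output_line_count: int = 0

    row: typing.List[str]
    for row in kr:
        input_line_count += 1

        if row[pred_idx] in pred_filter:  # set membership instead of a single equality test
            kw.write(row)
            output_line_count += 1
        else:
            if rw is not None:
                rw.write(row)
            reject_line_count += 1

    if verbose:
        print("Read %d rows, rejected %d rows, wrote %d rows." %
              (input_line_count, reject_line_count, output_line_count),
              file=error_file, flush=True)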
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    # Note: This code makes the assumption that row lengths do not vary!
    if self.current_key is not None:
        # We have a record being built.  Write it?
        if flush or self.current_key != input_key:
            # self.current_key != input_key means that the key is changing.
            self.compact_row()
            if self.current_row is not None:
                if idb is None:
                    ew.write(self.current_row)
                else:
                    ew.write(idb.build(self.current_row, line_number))
                self.output_line_count += 1
            self.current_key = None
            self.current_row = None

    if flush:
        # This was a flush request.  We're done.
        return

    # Are we starting a new key?
    if self.current_key is None:
        # Save the new row.
        self.current_key = input_key
        self.expand_row(row)
    else:
        # Merge into an existing row.
        self.merge_row(row)
def process_cacheing_filter(self, input_kr: KgtkReader,
                            filter_kr: KgtkReader,
                            input_key_columns: typing.List[int],
                            filter_key_columns: typing.List[int],
                            ew: KgtkWriter):
    if self.verbose:
        print("Processing by caching the filter file's key set.",
              file=self.error_file, flush=True)

    if self.verbose:
        print("Building the filter key set from %s" % self.filter_file_path,
              file=self.error_file, flush=True)
    key_set: typing.Set[str] = self.extract_key_set(filter_kr, "filter",
                                                    filter_key_columns)
    if self.verbose or self.very_verbose:
        print("There are %d entries in the filter key set." % len(key_set),
              file=self.error_file, flush=True)
    if self.very_verbose:
        print("Keys: %s" % " ".join(key_set),
              file=self.error_file, flush=True)

    if self.verbose:
        print("Filtering records from %s" % self.input_file_path,
              file=self.error_file, flush=True)
    input_line_count: int = 0
    output_line_count: int = 0

    # TODO: join these two code paths using xor?
    row: typing.List[str]
    input_key: str
    if self.invert:
        for row in input_kr:
            input_line_count += 1
            input_key = self.build_key(row, input_key_columns)
            if input_key not in key_set:
                ew.write(row)
                output_line_count += 1
    else:
        for row in input_kr:
            input_line_count += 1
            input_key = self.build_key(row, input_key_columns)
            if input_key in key_set:
                ew.write(row)
                output_line_count += 1

    if self.verbose:
        print("Read %d records, wrote %d records." %
              (input_line_count, output_line_count),
              file=self.error_file, flush=True)
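# Hedged illustration of the TODO above ("join these two code paths using xor?"):
# a single loop can fold the invert flag into the membership test.  This is a
# sketch of the idea, not code from the original module.
#
#     for row in input_kr:
#         input_line_count += 1
#         input_key = self.build_key(row, input_key_columns)
#         if (input_key in key_set) != self.invert:
#             ew.write(row)
#             output_line_count += 1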
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    if self.very_verbose:
        print("Input key %s" % repr(input_key), file=self.error_file, flush=True)

    # Note: This code makes the assumption that row lengths do not vary!
    if self.current_key is not None:
        if self.very_verbose:
            print("Current key %s" % repr(self.current_key),
                  file=self.error_file, flush=True)
        # We have a record being built.  Write it?
        if flush or self.current_key != input_key:
            if self.very_verbose:
                if flush:
                    print("flush", file=self.error_file, flush=True)
                else:
                    print("current_key %s != input_key %s" %
                          (repr(self.current_key), repr(input_key)),
                          file=self.error_file, flush=True)
            # self.current_key != input_key means that the key is changing.
            self.compact_row()
            if self.current_row is not None:
                if self.very_verbose:
                    print("writing %s" %
                          repr(self.field_separator.join(self.current_row)),
                          file=self.error_file, flush=True)
                if idb is None:
                    ew.write(self.current_row)
                else:
                    ew.write(idb.build(self.current_row, line_number))
                self.output_line_count += 1
            self.current_key = None
            self.current_row = None

    if flush:
        # This was a flush request.  We're done.
        return

    # Are we starting a new key?
    if self.current_key is None:
        # Save the new row.
        if self.very_verbose:
            print("New current_key %s" % repr(input_key),
                  file=self.error_file, flush=True)
        self.current_key = input_key
        if self.very_verbose:
            print("Expand row %s" % self.field_separator.join(row),
                  file=self.error_file, flush=True)
        self.expand_row(row)
    else:
        # Merge into an existing row.
        if self.very_verbose:
            print("Merge row", file=self.error_file, flush=True)
        self.merge_row(row)
def generate_kgtk_output(entities_output, output_kgtk_file, output_no_header,
                         verbose, very_verbose):
    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(['node1', 'label', 'node2'],
                                     output_kgtk_file,
                                     mode=KgtkWriter.Mode.AUTO,
                                     require_all_columns=False,
                                     prohibit_extra_columns=False,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     no_header=output_no_header,
                                     verbose=verbose,
                                     very_verbose=very_verbose)

    input_line_count: int = 0
    if verbose:
        logging.info("Processing the input records.")

    MODULE_NAME = 'graph_embeddings'  # __name__.split('.')[-1]
    with open(entities_output) as wv_file:
        for line in wv_file:
            line = line.replace('\n', '')  # remove the trailing newline
            entity_name = line.split('\t')[0]
            entity_vec = ','.join(line.split('\t')[1:])
            input_line_count += 1
            kw.write([entity_name, MODULE_NAME, entity_vec])

    if verbose:
        logging.info("Processed %d records." % input_line_count)

    kw.close()
def write_updated_namespace_file(self):
    # Is there an updated namespaces file?
    if self.updated_namespace_file_path is None:
        return

    if self.verbose:
        print("Opening updated namespaces file %s" %
              str(self.updated_namespace_file_path),
              file=self.error_file, flush=True)

    # Open the updated namespaces file.
    un: KgtkWriter = KgtkWriter.open(self.COLUMN_NAMES,
                                     self.updated_namespace_file_path,
                                     mode=KgtkWriter.Mode.EDGE,
                                     require_all_columns=True,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    namespace_id: str
    for namespace_id in sorted(self.namespace_ids.keys()):
        un.write([
            namespace_id, self.prefix_expansion_label,
            '"' + self.namespace_ids[namespace_id] + '"'
        ])
    un.close()
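# Hedged note on the quoting above: wrapping the expansion in literal double quotes
# assumes the namespace expansion never contains quotes, tabs, or newlines.
# KgtkFormat.stringify (used elsewhere in this file) escapes such characters, so a
# more defensive variant of the write could be:
#
#     un.write([namespace_id,
#               self.prefix_expansion_label,
#               KgtkFormat.stringify(self.namespace_ids[namespace_id])])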
def process(self):
    # Open the input file.
    if self.verbose:
        print("Opening the input file: %s" % str(self.input_file_path),
              file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Opening the output file: %s" % str(self.output_file_path),
              file=self.error_file, flush=True)

    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                     self.output_file_path,
                                     mode=KgtkWriter.Mode[kr.mode.name],
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)
    # At this point kw has already written a header line, which PBG does not want.

    input_line_count: int = 0
    if self.verbose:
        print("Processing the input records.", file=self.error_file, flush=True)

    # node1 relation node2
    node1_index = kr.get_node1_column_index()
    node2_index = kr.get_node2_column_index()
    relation_index = kr.get_id_column_index('relation')

    row: typing.List[str]

    # Delete the header: move the cursor to the top of the file and truncate,
    # discarding the header line that was just written.
    kw.file_out.seek(0)
    kw.file_out.truncate()
    # print(kw.file_out.tell())

    for row in kr:
        input_line_count += 1
        kw.write([row[node1_index], row[relation_index], row[node2_index]])

    if self.verbose:
        print("Processed %d records." % (input_line_count),
              file=self.error_file, flush=True)

    kw.close()
def write_output_row(
        self,
        ew: KgtkWriter,
        row: typing.List[str],
        new_columns: int,
        input_select_column_idx: int,
        label_select_column_idx: int,
        labels: typing.Mapping[str, str],
        lifted_column_idxs: typing.List[int],
        lifted_output_column_idxs: typing.List[int],
) -> bool:
    output_row: typing.List[str] = row.copy()
    if new_columns > 0:
        output_row.extend([""] * new_columns)
    output_select_column_idx: int = input_select_column_idx

    do_write: bool = True
    do_lift: bool = True
    if label_select_column_idx >= 0:
        if row[label_select_column_idx] == self.label_select_column_value:
            # Don't lift label columns, if we have stored labels in the input records.
            do_lift = False
            if self.remove_label_records:
                do_write = False
    if input_select_column_idx >= 0:
        if self.input_select_column_value is not None and \
           row[input_select_column_idx] != self.input_select_column_value:
            # Not selected for lifting into.
            do_lift = False
    if do_lift:
        # Lift the specified columns in this row.
        did_lift: bool = False
        lifted_column_idx: int
        for idx, lifted_column_idx in enumerate(lifted_column_idxs):
            label_key: str = row[lifted_column_idx]
            if label_key in labels:
                output_row[lifted_output_column_idxs[idx]] = labels[label_key]
                did_lift = True  # What if we want to note if we lifted all columns?
        if did_lift and output_select_column_idx >= 0 and self.output_select_column_value is not None:
            output_row[output_select_column_idx] = self.output_select_column_value

    if do_write:
        ew.write(output_row)

    return do_write
def write_new_edge(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        edge_id: str,
        rdf_subject_value: str,
        rdf_predicate_value: str,
        rdf_object_value: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
):
    kw.writemap({
        node1_column_name: rdf_subject_value,
        label_column_name: rdf_predicate_value,
        node2_column_name: rdf_object_value,
        id_column_name: edge_id,
    })
    self.output_line_count += 1

    if unreifiedw is not None:
        unreifiedw.writemap({
            node1_column_name: rdf_subject_value,
            label_column_name: rdf_predicate_value,
            node2_column_name: rdf_object_value,
            id_column_name: edge_id,
        })

    self.write_edge_attributes(
        kw,
        unreifiedw,
        potential_edge_attributes,
        edge_id,
        label_column_idx,
        node2_column_idx,
        node1_column_name,
        label_column_name,
        node2_column_name,
        id_column_name,
    )
def write_new_edge(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        node1_value: str,
        node2_value: str,
        edge_id: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
):
    new_label_value: str = self.new_label_value if self.new_label_value is not None else self.value_label_value

    kw.writemap({
        node1_column_name: node1_value,
        label_column_name: new_label_value,
        node2_column_name: node2_value,
        id_column_name: edge_id,
    })
    self.output_line_count += 1

    if unreifiedw is not None:
        unreifiedw.writemap({
            node1_column_name: node1_value,
            label_column_name: new_label_value,
            node2_column_name: node2_value,
            id_column_name: edge_id,
        })

    self.write_edge_attributes(
        kw,
        unreifiedw,
        potential_edge_attributes,
        edge_id,
        label_column_idx,
        node2_column_idx,
        node1_column_name,
        label_column_name,
        node2_column_name,
        id_column_name,
    )
def pass_group_through(self, kw: KgtkWriter,
                       uninvolvedw: typing.Optional[KgtkWriter],
                       node1_group: typing.List[typing.List[str]],
                       new_id_column: bool):
    # Unreification was not triggered.  Pass this group of rows
    # through unchanged, except for possibly appending an ID
    # column.
    #
    # TODO: Perhaps we'd like to build an ID value at the same time?
    row: typing.List[str]
    for row in node1_group:
        if uninvolvedw is not None:
            uninvolvedw.write(row)

        if new_id_column:
            row = row.copy()
            row.append("")
        kw.write(row)
        self.output_line_count += 1
def write_edge_attributes(
        self,
        kw: KgtkWriter,
        unreifiedw: typing.Optional[KgtkWriter],
        potential_edge_attributes: typing.List[typing.List[str]],
        edge_id: str,
        label_column_idx: int,
        node2_column_idx: int,
        node1_column_name: str,
        label_column_name: str,
        node2_column_name: str,
        id_column_name: str,
):
    width: int = self.get_width(len(potential_edge_attributes))
    attribute_number: int = 0
    edge_row: typing.List[str]
    for edge_row in potential_edge_attributes:
        attribute_number += 1
        attr_edge_id: str = self.make_new_id(edge_id, attribute_number, width)
        kw.writemap({
            node1_column_name: edge_id,
            label_column_name: edge_row[label_column_idx],
            node2_column_name: edge_row[node2_column_idx],
            id_column_name: attr_edge_id
        })
        self.output_line_count += 1

        if unreifiedw is not None:
            unreifiedw.writemap({
                node1_column_name: edge_id,
                label_column_name: edge_row[label_column_idx],
                node2_column_name: edge_row[node2_column_idx],
                id_column_name: attr_edge_id
            })
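# Hedged sketch (an assumption, not the actual helpers): get_width() and
# make_new_id() above presumably zero-pad the attribute number so that attribute
# edge IDs sort lexically, along the lines of:
#
#     def make_new_id(self, edge_id: str, attribute_number: int, width: int) -> str:
#         return "%s-%0*d" % (edge_id, width, attribute_number)
#
# which would yield IDs such as E1-01 ... E1-12 for twelve attribute edges.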
def process_row(self,
                input_key: str,
                row: typing.List[str],
                line_number: int,
                idb: typing.Optional[KgtkIdBuilder],
                ew: KgtkWriter,
                flush: bool = False):
    # Note: This code makes the assumption that row lengths do not vary!
    if self.current_key is not None:
        # We have a record being built.  Write it?
        if flush or self.current_key != input_key:
            # self.current_key != input_key means that the key is changing.
            self.compact_row()
            if self.current_row is not None:
                if idb is None:
                    ew.write(self.current_row)
                else:
                    ew.write(idb.build(self.current_row, line_number))
            self.current_key = None
            self.current_row = None

    if flush:
        # This was a flush request.  We're done.
        return

    # Are we starting a new key?
    if self.current_key is None:
        # Save the new row as the current row.  If the next row
        # doesn't have the same input key, we'll write this
        # row out with a minimum of handling.
        self.current_key = input_key
        self.current_row = row
        return

    if self.current_row_lists is None:
        self.expand_row()
    self.merge_row(row)
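# Hedged sketch of how a caller typically drives process_row(): rows arrive grouped
# (or sorted) by key, and one final call with flush=True writes the last accumulated
# record.  The surrounding reader/writer setup is assumed to match the process()
# methods elsewhere in this file; the names used here are illustrative.
#
#     input_line_count = 0
#     for row in kr:
#         input_line_count += 1
#         input_key = self.build_key(row, key_idx_list)
#         self.process_row(input_key, row, input_line_count, idb, ew)
#     self.process_row("", row, input_line_count, idb, ew, flush=True)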
def write_output_row(self, ew: KgtkWriter, row: typing.List[str],
                     new_columns: int, label_column_idx: int,
                     labels: typing.Mapping[str, str],
                     lifted_column_idxs: typing.List[int],
                     lifted_output_column_idxs: typing.List[int]):
    output_row: typing.List[str] = row.copy()
    if new_columns > 0:
        output_row.extend([""] * new_columns)

    if label_column_idx >= 0 and row[label_column_idx] == self.label_column_value:
        # Don't lift label columns, if we have stored labels in the input records.
        pass
    else:
        # Lift the specified columns in this row.
        lifted_column_idx: int
        for idx, lifted_column_idx in enumerate(lifted_column_idxs):
            lifted_value: str = row[lifted_column_idx]
            if lifted_value in labels:
                output_row[lifted_output_column_idxs[idx]] = labels[row[lifted_column_idx]]

    ew.write(output_row)
    return
def process(self):
    input_kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        who="input",
        options=self.input_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    input_key_columns: typing.List[int] = self.get_key_columns(input_kr, "input")
    label_col_idx = input_key_columns[1]
    label = '{}{}'.format('c', label_col_idx)

    g = load_graph_from_csv(str(input_kr.file_path),
                            not self.undirected,
                            skip_first=not self.no_header,
                            hashed=True,
                            csv_options={'delimiter': '\t'},
                            ecols=(input_key_columns[0], input_key_columns[2]))

    es = []
    header = ['node1', 'label', 'node2']
    if self.properties:
        properties = self.properties.split(',')
        for e in properties:
            es += find_edge(g, g.edge_properties[label], e)
        g.clear_edges()
        g.add_edge_list(list(set(es)))

    comp, hist = label_components(g, directed=self.strong)

    ew: KgtkWriter = KgtkWriter.open(header,
                                     self.output_file_path,
                                     mode=input_kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    for v, c in enumerate(comp):
        ew.write([
            g.vertex_properties['name'][v], 'connected_component', str(c)
        ])
def write_files(error_file, file_number, file_prefix, kr, lines_to_write,
                output_path, Qnode, reader_options, split_by_qnode, suffix):
    if split_by_qnode:
        output_kgtk_file = Path(f'{output_path}/{Qnode}{suffix}')
    else:
        output_kgtk_file = Path(f'{output_path}/{file_prefix}{file_number}{suffix}')

    kw = KgtkWriter.open(kr.column_names,
                         output_kgtk_file,
                         mode=KgtkWriter.Mode[kr.mode.name],
                         use_mgzip=reader_options.use_mgzip,  # Hack!
                         mgzip_threads=reader_options.mgzip_threads,  # Hack!
                         error_file=error_file,
                         verbose=False,
                         very_verbose=False)

    for r in lines_to_write:
        kw.write(r)

    kw.close()
def generate_kgtk_output(entities_output, output_kgtk_file, verbose, very_verbose):
    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(
        ['id', 'node1', 'node2', 'relation'],  # in order to obey the kgtk rules
        output_kgtk_file,
        mode=KgtkWriter.Mode.AUTO,
        require_all_columns=False,
        prohibit_extra_columns=False,
        fill_missing_columns=False,
        gzip_in_parallel=False,
        verbose=verbose,
        very_verbose=very_verbose)

    input_line_count: int = 0
    if verbose:
        logging.info("Processing the input records.")

    # Delete the header: rewind to the top of the file and truncate, which
    # discards the header line that KgtkWriter just wrote.
    kw.file_out.seek(0)
    kw.file_out.truncate()

    MODULE_NAME = 'graph_embeddings'  # __name__.split('.')[-1]
    with open(entities_output) as wv_file:
        for line in wv_file:
            line = line.replace('\n', '')  # remove the trailing newline
            entity_name = line.split('\t')[0]
            entity_vec = ','.join(line.split('\t')[1:])
            input_line_count += 1
            kw.write([entity_name, MODULE_NAME, entity_vec])

    if verbose:
        logging.info("Processed %d records." % input_line_count)

    kw.close()
def process(self):
    # Open the input file.
    if self.verbose:
        print("Opening the input file: %s" % str(self.input_file_path),
              file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Opening the output file: %s" % str(self.output_file_path),
              file=self.error_file, flush=True)

    # Open the output file.
    kw: KgtkWriter = KgtkWriter.open(kr.column_names,
                                     self.output_file_path,
                                     mode=KgtkWriter.Mode[kr.mode.name],
                                     require_all_columns=True,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=False,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    input_line_count: int = 0
    if self.verbose:
        print("Processing the input records.", file=self.error_file, flush=True)

    row: typing.List[str]
    for row in kr:
        input_line_count += 1
        kw.write(row)

    if self.verbose:
        print("Processed %d records." % (input_line_count),
              file=self.error_file, flush=True)

    kw.close()
def open_output_writer(
        self, ikr: KgtkReader, lifted_column_idxs: typing.List[int]
) -> typing.Tuple[KgtkWriter, typing.List[int]]:
    # Build the output column names.
    output_column_names: typing.List[str]
    lifted_output_column_idxs: typing.List[int]
    output_column_names, lifted_output_column_idxs = self.build_output_column_names(
        ikr, lifted_column_idxs)

    if self.verbose:
        print("Opening the output file: %s" % self.output_file_path,
              file=self.error_file, flush=True)
    ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                     self.output_file_path,
                                     mode=KgtkWriter.Mode[ikr.mode.name],
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    return ew, lifted_output_column_idxs
def process(self):
    UPDATE_VERSION: str = "2020-08-24T21:47:20.256050+00:00#mr0wtMHlN/QaplDsGc/ylG3Hw5stsjziykzuGlSHBSion4xoW/Bec0sn55IQ7wFWBUClRS7g1tbAuaqEduhUVA=="
    if self.show_version or self.verbose:
        print("KgtkIfExists version: %s" % UPDATE_VERSION,
              file=self.error_file, flush=True)

    # Open the input files once.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)

    input_kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        who="input",
        options=self.input_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.verbose:
        print("Opening the filter input file: %s" % self.filter_file_path,
              file=self.error_file, flush=True)
    filter_kr: KgtkReader = KgtkReader.open(
        self.filter_file_path,
        who="filter",
        error_file=self.error_file,
        options=self.filter_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    input_key_columns: typing.List[int] = self.get_key_columns(
        self.input_keys, input_kr, filter_kr, "input")
    filter_key_columns: typing.List[int] = self.get_key_columns(
        self.filter_keys, filter_kr, input_kr, "filter")

    if len(input_key_columns) != len(filter_key_columns):
        print("There are %d input key columns but %d filter key columns.  Exiting." %
              (len(input_key_columns), len(filter_key_columns)),
              file=self.error_file, flush=True)
        return

    ew: typing.Optional[KgtkWriter] = None
    if self.output_file_path is not None:
        if self.verbose:
            print("Opening the output file: %s" % self.output_file_path,
                  file=self.error_file, flush=True)
        ew = KgtkWriter.open(input_kr.column_names,
                             self.output_file_path,
                             mode=input_kr.mode,
                             require_all_columns=False,
                             prohibit_extra_columns=True,
                             fill_missing_columns=True,
                             gzip_in_parallel=False,
                             verbose=self.verbose,
                             very_verbose=self.very_verbose)

    rew: typing.Optional[KgtkWriter] = None
    if self.reject_file_path is not None:
        if self.verbose:
            print("Opening the reject file: %s" % self.reject_file_path,
                  file=self.error_file, flush=True)
        rew = KgtkWriter.open(input_kr.column_names,
                              self.reject_file_path,
                              mode=input_kr.mode,
                              require_all_columns=False,
                              prohibit_extra_columns=True,
                              fill_missing_columns=True,
                              gzip_in_parallel=False,
                              verbose=self.verbose,
                              very_verbose=self.very_verbose)

    if self.cache_input:
        if self.preserve_order:
            self.process_cacheing_input_preserving_order(
                input_kr=input_kr,
                filter_kr=filter_kr,
                input_key_columns=input_key_columns,
                filter_key_columns=filter_key_columns,
                ew=ew,
                rew=rew)
        else:
            self.process_cacheing_input(input_kr=input_kr,
                                        filter_kr=filter_kr,
                                        input_key_columns=input_key_columns,
                                        filter_key_columns=filter_key_columns,
                                        ew=ew,
                                        rew=rew)
    else:
        self.process_cacheing_filter(input_kr=input_kr,
                                     filter_kr=filter_kr,
                                     input_key_columns=input_key_columns,
                                     filter_key_columns=filter_key_columns,
                                     ew=ew,
                                     rew=rew)

    if ew is not None:
        ew.close()
    if rew is not None:
        rew.close()
def run(
        input_file: KGTKFiles,
        path_file: KGTKFiles,
        output_file: KGTKFiles,
        statistics_only: bool,
        undirected: bool,
        max_hops: int,
        source_column_name: typing.Optional[str],
        target_column_name: typing.Optional[str],
        shortest_path: bool,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool.all import find_vertex
    from graph_tool.topology import all_paths
    from graph_tool.topology import all_shortest_paths

    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions
    from kgtk.exceptions import KGTKException

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="input", fallback=True)
        path_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(
            kwargs, who="path", fallback=True)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        path_kgtk_file: Path = KGTKArgumentParser.get_input_file(path_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        id_col = 'name'

        if verbose:
            print("Reading the path file: %s" % str(path_kgtk_file),
                  file=error_file, flush=True)
        pairs = []
        pkr: KgtkReader = KgtkReader.open(
            path_kgtk_file,
            error_file=error_file,
            options=path_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        path_source_idx: int = pkr.get_node1_column_index(source_column_name)
        if path_source_idx < 0:
            print("Missing node1 (source) column name in the path file.",
                  file=error_file, flush=True)
        path_target_idx: int = pkr.get_node2_column_index(target_column_name)
        if path_target_idx < 0:
            print("Missing node2 (target) column name in the path file.",
                  file=error_file, flush=True)
        if path_source_idx < 0 or path_target_idx < 0:
            pkr.close()
            raise KGTKException("Exiting due to missing columns.")

        paths_read: int = 0
        path_row: typing.List[str]
        for path_row in pkr:
            paths_read += 1
            if len(path_row) != pkr.column_count:
                raise KGTKException(
                    "Exiting because line %d in the path file (%s) is the wrong length: %d columns expected, %d were read."
                    % (paths_read, str(path_kgtk_file), pkr.column_count, len(path_row)))
            src: str = path_row[path_source_idx]
            tgt: str = path_row[path_target_idx]
            pairs.append((src, tgt))
        pkr.close()
        if verbose:
            print("%d path rows read" % paths_read, file=error_file, flush=True)

        if len(pairs) == 0:
            print("No path pairs found, the output will be empty.",
                  file=error_file, flush=True)
        elif verbose:
            print("%d path pairs found" % len(pairs), file=error_file, flush=True)

        if verbose:
            print("Reading the input file: %s" % str(input_kgtk_file),
                  file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=input_reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub_index: int = kr.get_node1_column_index()
        if sub_index < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred_index: int = kr.get_label_column_index()
        if pred_index < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj_index: int = kr.get_node2_column_index()
        if obj_index < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        id_index: int = kr.get_id_column_index()
        if id_index < 0:
            print("Missing id column", file=error_file, flush=True)
        if sub_index < 0 or pred_index < 0 or obj_index < 0 or id_index < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred_index]
        id_col_name: str = kr.column_names[id_index]

        G = load_graph_from_kgtk(kr,
                                 directed=not undirected,
                                 ecols=(sub_index, obj_index),
                                 verbose=verbose,
                                 out=error_file)

        output_columns: typing.List[str] = ['node1', 'label', 'node2', 'id']
        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        id_count = 0
        if not statistics_only:
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                kw.write([
                    G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                    '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1
            if verbose:
                print("%d edges found." % id_count, file=error_file, flush=True)

        id_count = 0
        path_id = 0
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                if shortest_path:
                    _all_paths = all_shortest_paths(G, source_id, target_id,
                                                    edges=True)
                else:
                    _all_paths = all_paths(G, source_id, target_id,
                                           cutoff=max_hops, edges=True)

                for path in _all_paths:
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1: str = 'p%d' % path_id
                        kw.write([
                            node1,
                            str(edge_num), edge_id,
                            '{}-{}-{}'.format(node1, edge_num, id_count)
                        ])
                        id_count += 1
                    path_id += 1

        if verbose:
            print("%d paths containing %d edges found." % (path_id, id_count),
                  file=error_file, flush=True)

        kw.close()
        kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def process(self):
    kmc: KgtkMergeColumns = KgtkMergeColumns()

    # Is the output file an edge file, a node file, or unknown?
    is_edge_file: bool = False
    is_node_file: bool = False

    krs: typing.List[KgtkReader] = []
    kr: KgtkReader
    idx: int

    if self.verbose:
        print("Starting kgtkcat pid=%d" % (os.getpid()),
              file=self.error_file, flush=True)
    if self.verbose:
        print("Opening the %d input files." % len(self.input_file_paths),
              file=self.error_file, flush=True)

    saw_stdin: bool = False
    input_file_path: Path
    for idx, input_file_path in enumerate(self.input_file_paths):
        if str(input_file_path) == "-":
            if saw_stdin:
                raise ValueError("Duplicate standard input file %d" % (idx + 1))
            else:
                saw_stdin = True
            if self.verbose:
                print("Opening file %d: standard input" % (idx + 1),
                      file=self.error_file, flush=True)
        else:
            if self.verbose:
                print("Opening file %d: %s" % (idx + 1, str(input_file_path)),
                      file=self.error_file, flush=True)

        kr = KgtkReader.open(
            input_file_path,
            who="input " + str(idx + 1),
            options=self.reader_options,
            value_options=self.value_options,
            error_file=self.error_file,
            verbose=self.verbose,
            very_verbose=self.very_verbose,
        )
        krs.append(kr)

        # Unless directed otherwise, do not merge edge files with node
        # files.  If options.mode == KgtkReaderMode.NONE, then neither
        # kr.is_edge_file nor kr.is_node_file will be set and the
        # consistency check will be skipped.
        if kr.is_edge_file:
            if is_node_file:
                # Close the open files before raising the exception.
                #
                # TODO: Use a try..finally block to ensure these files are closed.
                for kr2 in krs:
                    kr2.close()
                raise ValueError(
                    "Cannot merge an edge file to a node file: %s" % input_file_path)
            if is_edge_file == False and self.verbose:
                print("The output file will be an edge file.",
                      file=self.error_file, flush=True)
            is_edge_file = True
        elif kr.is_node_file:
            if is_edge_file:
                # Close the open files before raising the exception.
                #
                # TODO: Use a try..finally block to ensure these files are closed.
                for kr2 in krs:
                    kr2.close()
                raise ValueError(
                    "Cannot merge a node file to an edge file: %s" % input_file_path)
            if is_node_file == False and self.verbose:
                print("The output file will be a node file.",
                      file=self.error_file, flush=True)
            is_node_file = True

        if self.verbose or self.very_verbose:
            print("Mapping the %d column names in %s." %
                  (len(kr.column_names), input_file_path),
                  file=self.error_file, flush=True)
        if self.very_verbose:
            print(" ".join(kr.column_names), file=self.error_file, flush=True)
        new_column_names: typing.List[str] = kmc.merge(kr.column_names)
        if self.very_verbose:
            print(" ".join(new_column_names), file=self.error_file, flush=True)

    if self.verbose or self.very_verbose:
        print("There are %d merged columns." % len(kmc.column_names),
              file=self.error_file, flush=True)
    if self.very_verbose:
        print(" ".join(kmc.column_names), file=self.error_file, flush=True)

    if self.output_column_names is not None:
        if self.verbose:
            print("There are %d new output column names." %
                  len(self.output_column_names),
                  file=self.error_file, flush=True)
        if len(self.output_column_names) != len(kmc.column_names):
            # Close the open files before raising the exception.
            #
            # TODO: Use a try..finally block to ensure these files are closed.
            for kr2 in krs:
                kr2.close()
            raise ValueError(
                "There are %d merged columns, but %d output column names." %
                (len(kmc.column_names), len(self.output_column_names)))

    output_mode: KgtkWriter.Mode = KgtkWriter.Mode.NONE
    if is_edge_file:
        output_mode = KgtkWriter.Mode.EDGE
        if self.verbose:
            print("Opening the output edge file: %s" % str(self.output_path),
                  file=self.error_file, flush=True)
    elif is_node_file:
        output_mode = KgtkWriter.Mode.NODE
        if self.verbose:
            print("Opening the output node file: %s" % str(self.output_path),
                  file=self.error_file, flush=True)
    else:
        if self.verbose:
            print("Opening the output file: %s" % str(self.output_path),
                  file=self.error_file, flush=True)

    ew: KgtkWriter = KgtkWriter.open(
        kmc.column_names,
        self.output_path,
        require_all_columns=False,
        prohibit_extra_columns=True,
        fill_missing_columns=True,
        use_mgzip=self.reader_options.use_mgzip,  # Hack!
        mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
        gzip_in_parallel=False,
        mode=output_mode,
        output_format=self.output_format,
        output_column_names=self.output_column_names,
        old_column_names=self.old_column_names,
        new_column_names=self.new_column_names,
        verbose=self.verbose,
        very_verbose=self.very_verbose)

    output_data_lines: int = 0
    for idx, kr in enumerate(krs):
        if kr.file_path is None:
            # This shouldn't happen because we constrained all
            # input_file_path elements to be not None.  However,
            # checking here keeps mypy happy.
            #
            # TODO: throw a better exception.
            #
            # Close the open files before raising the exception.
            #
            # TODO: Use a try..finally block to ensure these files are closed.
            for kr2 in krs:
                kr2.close()
            raise ValueError("Missing file path.")
        input_file_path = kr.file_path
        if self.verbose:
            print("Copying data from file %d: %s" % (idx + 1, input_file_path),
                  file=self.error_file, flush=True)

        shuffle_list: typing.List[int] = ew.build_shuffle_list(
            kmc.new_column_name_lists[idx])

        input_data_lines: int = 0
        row: typing.List[str]
        for row in kr:
            input_data_lines += 1
            output_data_lines += 1
            ew.write(row, shuffle_list=shuffle_list)

        # Flush the output file so far:
        ew.flush()

        if self.verbose:
            print("Read %d data lines from file %d: %s" %
                  (input_data_lines, idx + 1, input_file_path),
                  file=self.error_file, flush=True)

    if self.verbose:
        print("Wrote %d lines total from %d files" %
              (output_data_lines, len(krs)),
              file=self.error_file, flush=True)

    # Close the open files.
    ew.close()
    for kr2 in krs:
        kr2.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        errors_to_stdout: bool = False,
        errors_to_stderr: bool = True,
        show_options: bool = False,
        verbose: bool = False,
        very_verbose: bool = False,
        **kwargs  # Whatever KgtkFileOptions and KgtkValueOptions want.
) -> int:
    # import modules locally
    from pathlib import Path
    import sys
    import typing

    from kgtk.exceptions import KGTKException
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
    output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

    # Select where to send error messages, defaulting to stderr.
    error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

    # Build the option structures.
    idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)
    reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
    value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

    # Show the final option structures for debugging and documentation.
    if show_options:
        print("--input-file=%s" % str(input_kgtk_file), file=error_file)
        print("--output-file=%s" % str(output_kgtk_file), file=error_file)
        idbuilder_options.show(out=error_file)
        reader_options.show(out=error_file)
        value_options.show(out=error_file)
        print("=======", file=error_file, flush=True)

    try:
        # First create the KgtkReader.  It provides parameters used by the ID
        # column builder.  Next, create the ID column builder, which provides a
        # possibly revised list of column names for the KgtkWriter.  Create
        # the KgtkWriter.  Last, process the data stream.

        # Open the input file.
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        # Create the ID builder.
        idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

        # Open the output file.
        ew: KgtkWriter = KgtkWriter.open(idb.column_names,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode[kr.mode.name],
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         gzip_in_parallel=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        # Process the input file, building IDs.
        idb.process(kr, ew)

        # Clean up.
        ew.close()
        kr.close()

        return 0

    except SystemExit as e:
        raise KGTKException("Exit requested")
    except Exception as e:
        raise KGTKException(str(e))
def process(self):
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    self.id_column_idx = kr.id_column_idx

    # If requested, create the ID column builder.
    # Assemble the list of output column names.
    output_column_names: typing.List[str]
    idb: typing.Optional[KgtkIdBuilder] = None
    if self.build_id:
        if self.idbuilder_options is None:
            raise ValueError("ID build requested but ID builder options are missing")
        idb = KgtkIdBuilder.new(kr, self.idbuilder_options)
        output_column_names = idb.column_names
    else:
        output_column_names = kr.column_names

    # Build the list of key column edges:
    key_idx_list: typing.List[int] = []
    if len(self.key_column_names) == 0:
        if kr.is_edge_file:
            # Add the KGTK edge file required columns.
            key_idx_list.append(kr.node1_column_idx)
            key_idx_list.append(kr.label_column_idx)
            key_idx_list.append(kr.node2_column_idx)
            if not self.compact_id and kr.id_column_idx >= 0:
                key_idx_list.append(kr.id_column_idx)
        elif kr.is_node_file:
            # Add the KGTK node file required column:
            key_idx_list.append(kr.id_column_idx)
        else:
            raise ValueError(
                "The input file is neither an edge nor a node file.  Key columns must be supplied.")
    else:
        # Append columns to the list of key column indices,
        # silently removing duplicates, but complaining about unknown names.
        #
        # TODO: warn about duplicates?
        column_name: str
        for column_name in self.key_column_names:
            if column_name not in kr.column_name_map:
                raise ValueError("Column %s is not in the input file" %
                                 (repr(column_name)))
            key_idx: int = kr.column_name_map[column_name]
            if key_idx not in key_idx_list:
                key_idx_list.append(key_idx)

    if self.verbose:
        print("key indexes: %s" % " ".join([str(idx) for idx in key_idx_list]),
              file=self.error_file, flush=True)

    self.keep_first_idx_list.clear()
    if len(self.keep_first_names) > 0:
        keep_first_name: str
        for keep_first_name in self.keep_first_names:
            if keep_first_name not in kr.column_name_map:
                raise ValueError("Keep first column %s is not in the input file" %
                                 (repr(keep_first_name)))
            keep_first_idx: int = kr.column_name_map[keep_first_name]
            if keep_first_idx in key_idx_list:
                raise ValueError("Keep first column %s may not be a key column" %
                                 (repr(keep_first_name)))
            self.keep_first_idx_list.append(keep_first_idx)
        if self.verbose:
            print("keep first indexes: %s" %
                  " ".join([str(idx) for idx in self.keep_first_idx_list]),
                  file=self.error_file, flush=True)

    if self.deduplicate:
        if self.compact_id and kr.id_column_idx >= 0 and kr.id_column_idx not in self.keep_first_idx_list:
            self.keep_first_idx_list.append(kr.id_column_idx)
        # Any columns that aren't in the keep_first list and aren't
        # already in key_idx_list will be appended to key_idx_list:
        idx: int
        for idx in range(kr.column_count):
            if idx not in self.keep_first_idx_list and idx not in key_idx_list:
                key_idx_list.append(idx)
        if self.verbose:
            print("revised key indexes: %s" %
                  " ".join([str(idx) for idx in key_idx_list]),
                  file=self.error_file, flush=True)

    if self.verbose:
        key_idx_list_str: typing.List[str] = []
        for key_idx in key_idx_list:
            key_idx_list_str.append(str(key_idx))
        print("key indexes: %s" % " ".join(key_idx_list_str),
              file=self.error_file, flush=True)

    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(
        output_column_names,
        self.output_file_path,
        mode=kr.mode,
        require_all_columns=False,
        prohibit_extra_columns=True,
        fill_missing_columns=True,
        use_mgzip=self.reader_options.use_mgzip,  # Hack!
        mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
        gzip_in_parallel=False,
        verbose=self.verbose,
        very_verbose=self.very_verbose)

    # Open the optional list output file.
    lew: typing.Optional[KgtkWriter] = None
    if self.list_output_file_path is not None:
        lew = KgtkWriter.open(
            output_column_names,
            self.list_output_file_path,
            mode=kr.mode,
            require_all_columns=False,
            prohibit_extra_columns=True,
            fill_missing_columns=True,
            use_mgzip=self.reader_options.use_mgzip,  # Hack!
            mgzip_threads=self.reader_options.mgzip_threads,  # Hack!
            gzip_in_parallel=False,
            verbose=self.verbose,
            very_verbose=self.very_verbose)

    input_line_count: int = 0
    row: typing.List[str] = []
    input_key: str
    prev_input_key: typing.Optional[str] = None
    going_up: typing.Optional[bool] = None

    if self.sorted_input:
        if self.verbose:
            print("Reading the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if self.verify_sort:
                if prev_input_key is None:
                    prev_input_key = input_key
                else:
                    if going_up is None:
                        if prev_input_key < input_key:
                            going_up = True
                            prev_input_key = input_key
                        elif prev_input_key > input_key:
                            going_up = False
                            prev_input_key = input_key
                        else:
                            pass  # No change in input key
                    elif going_up:
                        if prev_input_key < input_key:
                            prev_input_key = input_key
                        elif prev_input_key > input_key:
                            raise ValueError(
                                "Line %d sort violation going up: prev='%s' curr='%s'" %
                                (input_line_count,
                                 prev_input_key.replace(self.field_separator,
                                                        KgtkFormat.LIST_SEPARATOR),
                                 input_key.replace(self.field_separator,
                                                   KgtkFormat.LIST_SEPARATOR)))
                        else:
                            pass  # No change in input_key
                    else:
                        if prev_input_key > input_key:
                            prev_input_key = input_key
                        elif prev_input_key < input_key:
                            raise ValueError(
                                "Line %d sort violation going down: prev='%s' curr='%s'" %
                                (input_line_count,
                                 prev_input_key.replace(self.field_separator,
                                                        KgtkFormat.LIST_SEPARATOR),
                                 input_key.replace(self.field_separator,
                                                   KgtkFormat.LIST_SEPARATOR)))
                        else:
                            pass  # No change in input_key

            self.process_row(input_key, row, input_line_count, idb, ew, lew)

    else:
        if self.verbose:
            print("Sorting the input data from %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        # Map key values to lists of input and output data.
        input_map: typing.MutableMapping[str, typing.List[typing.List[str]]] = {}
        for row in kr:
            input_line_count += 1
            input_key = self.build_key(row, key_idx_list)
            if input_key in input_map:
                # Append the row to an existing list for that key.
                input_map[input_key].append(row)
            else:
                # Create a new list of rows for this key.
                input_map[input_key] = [row]

        if self.verbose:
            print("Processing the sorted input data",
                  file=self.error_file, flush=True)
        for input_key in sorted(input_map.keys()):
            for row in input_map[input_key]:
                self.process_row(input_key, row, input_line_count, idb, ew, lew)

    # Flush the final row, if any.  We pass the last row read for
    # feedback, such as an ID uniqueness violation.
    self.process_row("", row, input_line_count, idb, ew, lew, flush=True)

    if self.verbose:
        print("Read %d records, excluded %d records, wrote %d records." %
              (input_line_count, self.excluded_row_count, self.output_line_count),
              file=self.error_file, flush=True)
        if lew is not None:
            print("Wrote %d list output records." % (self.list_output_line_count),
                  file=self.error_file, flush=True)

    ew.close()
    if lew is not None:
        lew.close()
def process(self):
    output_column_names: typing.List[str]
    if self.build_id and self.idbuilder_options is not None:
        self.idbuilder = KgtkIdBuilder.from_column_names(self.COLUMN_NAMES,
                                                         self.idbuilder_options)
        output_column_names = self.idbuilder.column_names
    else:
        output_column_names = self.COLUMN_NAMES

    if self.verbose:
        print("Opening output file %s" % str(self.output_file_path),
              file=self.error_file, flush=True)
    # Open the output file.
    ew: KgtkWriter = KgtkWriter.open(output_column_names,
                                     self.output_file_path,
                                     mode=KgtkWriter.Mode.EDGE,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    rw: typing.Optional[typing.TextIO] = None
    if self.reject_file_path is not None:
        if self.verbose:
            print("Opening reject file %s" % str(self.reject_file_path),
                  file=self.error_file, flush=True)
        # Open the reject file.  Since the input data is not in KGTK format,
        # we use an ordinary file here.
        if str(self.reject_file_path) == "-":
            rw = sys.stdout
        else:
            rw = open(self.reject_file_path, "wt")

    total_input_line_count: int = 0
    reject_line_count: int = 0

    namespace_line_count: int = self.get_initial_namespaces()

    input_file_path: str
    for input_file_path in self.input_file_paths:
        input_line_count: int = 0

        if self.local_namespace_use_uuid or self.namespace_id_use_uuid or self.newnode_use_uuid:
            if self.override_uuid is not None:
                self.local_namespace_uuid = self.override_uuid  # for debugging
            else:
                # Generate a new local namespace UUID.
                self.local_namespace_uuid = shortuuid.uuid()

        # Open the input file.
        if self.verbose:
            print("Opening the input file: %s" % input_file_path,
                  file=self.error_file, flush=True)

        infile: typing.TextIO
        if str(input_file_path) == "-":
            infile = sys.stdin
        else:
            infile = open(input_file_path, 'rt')

        line: str
        for line in infile:
            input_line_count += 1
            total_input_line_count += 1

            row: typing.List[str]
            valid: bool
            row, valid = self.parse(line, input_line_count)
            if not valid:
                if rw is not None:
                    rw.write(line)
                reject_line_count += 1
                continue

            node1: str
            ok_1: bool
            node1, ok_1 = self.convert_and_validate(row[0], input_line_count, ew)

            label: str
            ok_2: bool
            label, ok_2 = self.convert_and_validate(row[1], input_line_count, ew)

            node2: str
            ok_3: bool
            node2, ok_3 = self.convert_and_validate(row[2], input_line_count, ew)

            if ok_1 and ok_2 and ok_3:
                self.write_row(ew, node1, label, node2)
            else:
                if rw is not None:
                    rw.write(line)
                reject_line_count += 1

        if input_file_path != "-":
            infile.close()

    self.save_namespaces(ew)

    if self.verbose:
        print("Processed %d known namespaces." % (namespace_line_count),
              file=self.error_file, flush=True)
        print("Processed %d records." % (total_input_line_count),
              file=self.error_file, flush=True)
        print("Rejected %d records." % (reject_line_count),
              file=self.error_file, flush=True)
        print("Wrote %d records." % (self.output_line_count),
              file=self.error_file, flush=True)

    if ew is not None:
        ew.close()

    if rw is not None and self.reject_file_path is not None and self.reject_file_path != "-":
        rw.close()
def run(
        input_file: KGTKFiles,
        output_file: KGTKFiles,
        undirected: bool,
        compute_degrees: bool,
        compute_pagerank: bool,
        compute_hits: bool,
        log_file: str,
        statistics_only: bool,
        vertex_in_degree: str,
        vertex_out_degree: str,
        vertex_pagerank: str,
        vertex_auth: str,
        vertex_hubs: str,
        top_n: int,
        errors_to_stdout: bool,
        errors_to_stderr: bool,
        show_options: bool,
        verbose: bool,
        very_verbose: bool,
        **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    # import modules locally
    from pathlib import Path
    import sys

    from graph_tool import centrality

    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }

    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # Hardcoded values useful for the script.  Perhaps some of them should
        # be exposed as arguments later.
        directions = ['in', 'out', 'total']
        id_col = 'name'
        output_columns = ["node1", "label", "node2", "id"]

        if verbose:
            print('loading the KGTK input file...\n', file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )

        sub: int = kr.get_node1_column_index()
        if sub < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred: int = kr.get_label_column_index()
        if pred < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj: int = kr.get_node2_column_index()
        if obj < 0:
            print("Missing node2 (object) column", file=error_file, flush=True)
        if sub < 0 or pred < 0 or obj < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred]

        G2 = load_graph_from_kgtk(kr,
                                  directed=not undirected,
                                  ecols=(sub, obj),
                                  verbose=verbose,
                                  out=error_file)

        if verbose:
            print('graph loaded! It has %d nodes and %d edges.' %
                  (G2.num_vertices(), G2.num_edges()),
                  file=error_file, flush=True)

        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        with open(log_file, 'w') as writer:
            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if compute_degrees:
                writer.write('\n###Degrees:\n')
                for direction in directions:
                    degree_data = gtanalysis.compute_node_degree_hist(G2, direction)
                    max_degree = len(degree_data) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                    writer.write('%s degree stats: mean=%f, std=%f, max=%d\n' %
                                 (direction, mean_degree, std_degree, max_degree))

            if compute_pagerank:
                writer.write('\n###PageRank\n')
                v_pr = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=v_pr)
                G2.properties[('v', 'vertex_pagerank')] = v_pr
                writer.write('Max pageranks\n')
                result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', top_n, id_col)
                for n_id, n_label, pr in result:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

            if compute_hits:
                writer.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = gtanalysis.compute_hits(G2)
                writer.write('HITS hubs\n')
                main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', top_n, id_col)
                for n_id, n_label, hubness in main_hubs:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                writer.write('HITS auth\n')
                main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', top_n, id_col)
                for n_id, n_label, authority in main_auth:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

        id_count = 0
        if not statistics_only:
            for e in G2.edges():
                sid, oid = e
                lbl = G2.ep[predicate][e]
                kw.write([
                    G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                    '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1

        id_count = 0
        for v in G2.vertices():
            v_id = G2.vp[id_col][v]

            kw.write([
                v_id, vertex_in_degree,
                str(v.in_degree()),
                '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
            ])
            id_count += 1
            kw.write([
                v_id, vertex_out_degree,
                str(v.out_degree()),
                '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
            ])
            id_count += 1

            for vprop in G2.vertex_properties.keys():
                if vprop == id_col:
                    continue
                kw.write([
                    v_id, v_prop_dict[vprop],
                    str(G2.vp[vprop][v]),
                    '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                ])
                id_count += 1

        kw.close()
        kr.close()

    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def process(self):
    # Open the input file.
    if self.verbose:
        if self.input_file_path is not None:
            print("Opening the input file: %s" % self.input_file_path,
                  file=self.error_file, flush=True)
        else:
            print("Reading the input data from stdin",
                  file=self.error_file, flush=True)

    kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        options=self.reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )

    if self.column_name not in kr.column_name_map:
        raise ValueError("Column %s is not in the input file" % (self.column_name))
    column_idx: int = kr.column_name_map[self.column_name]

    where_column_idx: int = -1
    where_value_set: typing.Set[str] = set()
    if self.where_column_name is not None:
        if self.where_column_name not in kr.column_name_map:
            raise ValueError("Where column '%s' is not in the input file." %
                             (self.where_column_name))
        where_column_idx = kr.column_name_map[self.where_column_name]
        if self.where_values is None or len(self.where_values) == 0:
            raise ValueError("Where column '%s' but no values to test." %
                             (self.where_column_name))
        else:
            where_value_set = set(self.where_values)

    if self.verbose:
        print("Counting unique values from the %s column in %s" %
              (self.column_name, self.input_file_path),
              file=self.error_file, flush=True)
    input_line_count: int = 0
    skip_line_count: int = 0

    value_counts: typing.MutableMapping[str, int] = {}

    row: typing.List[str]
    for row in kr:
        input_line_count += 1
        if where_column_idx >= 0:
            if row[where_column_idx] not in where_value_set:
                skip_line_count += 1
                continue
        value: str = row[column_idx]
        if len(value) == 0:
            value = self.empty_value
        if len(value) > 0:
            value = self.prefix + value
            value_counts[value] = value_counts.get(value, 0) + 1

    if self.verbose:
        print("Read %d records, skipped %d, found %d unique non-empty values, %d empty values." %
              (input_line_count, skip_line_count, len(value_counts),
               input_line_count - len(value_counts)),
              file=self.error_file, flush=True)

    # In node mode we can't open the output file until we are done reading
    # the input file, because we need the list of unique values to
    # build the column list.
    output_columns: typing.List[str]
    if self.output_format == "edge":
        output_columns = ["node1", "label", "node2"]
    elif self.output_format == "node":
        output_columns = ["id"]
        for value in sorted(value_counts.keys()):
            # TODO: provide a way to override this check.
            if value in KgtkFormat.NODE1_COLUMN_NAMES:
                raise ValueError(
                    "Cannot write a KGTK node file with a column named '%s'." % value)
            output_columns.append(value)
    else:
        raise ValueError("Unknown output format %s" % str(self.output_format))

    if self.verbose:
        print("Opening the output file: %s" % self.output_file_path,
              file=self.error_file, flush=True)

    ew: KgtkWriter = KgtkWriter.open(output_columns,
                                     self.output_file_path,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)

    if self.output_format == "edge":
        for value in sorted(value_counts.keys()):
            ew.write([value, self.label_value, str(value_counts[value])])
    elif self.output_format == "node":
        row = [self.column_name]
        for value in sorted(value_counts.keys()):
            row.append(str(value_counts[value]))
        ew.write(row)
    else:
        raise ValueError("Unknown output format %s" % str(self.output_format))

    ew.close()
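# Illustrative example (assumed data and options, not from the original source):
# counting a "label" column whose values are P31, P31, P279, with label_value set
# to "count", would produce roughly the following.
#
#   edge format:                      node format:
#     node1   label   node2             id      P279    P31
#     P279    count   1                 label   1       2
#     P31     count   2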