def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    Parses an OBO-style ontology dump stored in the raw_line file (one
    stanza tag per row, with the raw text in column 3) and builds a
    dictionary mapping original term ids (and alt_ids) to
    'kn_id::kn_name' strings. As a side effect it writes the node and
    node_meta files for the property nodes, then deduplicates them with
    tu.csu into 'unique.*' files.

    Args:
        filename (str): The name of the raw_line file containing the
            information needed to produce the mapping dictionary.
        key_col (int): Unused here; kept for interface compatibility
            with sibling create_mapping_dict implementations.
        value_col (int): Unused here; kept for interface compatibility.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    term_map = dict()
    n_type = 'Property'
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')
    # state carried across lines of the current [Term] stanza
    orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
    # skip everything until the first [Term] stanza is seen
    skip = True
    with open(filename) as infile, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        reader = csv.reader(infile, delimiter='\t')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in reader:
            # column 3 of the raw_line file holds the original file text
            raw = line[3]
            if raw.startswith('[Term]'):
                # new stanza: start collecting, reset carried state
                skip = False
                orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
                continue
            if raw.startswith('[Typedef]'):
                # relationship definitions are not mapped
                skip = True
                continue
            if skip:
                continue
            if raw.startswith('id: '):
                orig_id = raw[4:].strip()
                kn_id = cf.pretty_name(orig_id)
                continue
            if raw.startswith('name: '):
                # 'name:' follows 'id:' in OBO stanzas, so orig_id/kn_id
                # are populated by the time the mapping is recorded
                orig_name = raw[6:].strip()
                kn_name = cf.pretty_name('go_' + orig_name)
                term_map[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name, n_type])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            if raw.startswith('alt_id: '):
                # alternate ids map to the same knowledge-network node
                alt_id = raw[8:].strip()
                term_map[alt_id] = kn_id + '::' + kn_name
                n_meta_writer.writerow([kn_id, 'alt_alias', alt_id])
    # deduplicate the produced node and node_meta files
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    return term_map
def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    Builds a dictionary mapping original ids (taken from key_col of the
    raw_line file) to 'kn_id::kn_name' strings (derived from value_col).
    As a side effect it writes the node and node_meta files for the
    property nodes and deduplicates them with tu.csu into 'unique.*'
    files. If the alias encoded in the filename is not a mapping file
    (per self.is_map), an empty dictionary is returned and no files are
    written.

    Args:
        filename (str): The name of the raw_line file containing the
            information needed to produce the mapping dictionary.
        key_col (int): The column containing the key for creating the
            dictionary. By default this is column 3.
        value_col (int): The column containing the value for creating
            the dictionary. By default this is column 4.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    src = filename.split('.')[0]
    alias = filename.split('.')[1]
    map_dict = dict()
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')
    if not self.is_map(alias):
        return map_dict
    with open(filename, 'rb') as map_file, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        # decode bytes per line so csv sees text regardless of platform
        reader = csv.reader((line.decode('utf-8') for line in map_file),
                            delimiter='\t')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in reader:
            # fix: dropped unused local `chksm = line[2]` — the line
            # hash was read but never used in this method
            orig_id = line[key_col].strip()
            orig_name = line[value_col].strip()
            kn_id = cf.pretty_name(orig_id)
            kn_name = cf.pretty_name(src + '_' + orig_name)
            map_dict[orig_id] = kn_id + '::' + kn_name
            # NOTE(review): sibling implementations write a third
            # n_type column here; this one writes two — confirm intended
            n_writer.writerow([kn_id, kn_name])
            n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
            n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
    # deduplicate the produced node and node_meta files
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    return map_dict
def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a 2table_edge file, an
    edge_meta file, a node and/or node_meta file (only for property nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
        None
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    # static column values
    n1type = 'property'
    n_type = 'Property'
    n2type = 'gene'
    n1hint = 'Pfam/Family'
    n2hint = 'Uniprot_gn'
    et_hint = 'pfam_prot'
    n1spec = '0'
    src = 'pf'
    # fix: removed dead code that loaded species.json into an unused
    # species_map, and an unused map_dict local — neither value was read
    n2spec = version_dict['alias']

    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split()
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            # skip commented lines (fix: str.startswith instead of
            # re.match — same anchored-prefix behavior, no regex needed)
            if raw[0].startswith('#'):
                continue
            orig_id = raw[5].strip()
            orig_name = raw[6].strip()
            kn_id = cf.pretty_name(src + '_' + orig_id)
            kn_name = cf.pretty_name(src + '_' + orig_name)
            n_writer.writerow([kn_id, kn_name, n_type])
            n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
            n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            n2orig = raw[0]
            # score the hit from its e-value: -log10(evalue), clamped to
            # [self.sc_min, self.sc_max]; rows below sc_min are dropped
            evalue = float(raw[12])
            score = self.sc_min
            if evalue == 0.0:
                score = self.sc_max
            if evalue > 0.0:
                score = round(-1.0 * math.log10(evalue), 4)
            if score > self.sc_max:
                score = self.sc_max
            if score < self.sc_min:
                continue
            output = [chksm, kn_id, n1hint, n1type, n1spec, n2orig,
                      n2hint, n2type, n2spec, et_hint, str(score)]
            # hash the edge content to produce a stable table_hash
            hasher = hashlib.md5()
            hasher.update('\t'.join(output).encode())
            t_chksum = hasher.hexdigest()
            edge_writer.writerow(output + [t_chksum])
    # deduplicate the produced node and node_meta files
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    Reads the raw_line file named by *filename* and builds a dictionary
    mapping original ids to 'kn_id::kn_name' strings. For the 'pathway'
    alias, property node and node_meta files are also written and
    deduplicated via tu.csu; for other aliases only the dictionary is
    built. Returns an empty dictionary when the alias is not a mapping
    file according to self.is_map.

    Args:
        filename (str): The name of the raw_line file containing the
            information needed to produce the mapping dictionary.
        key_col (int): Unused here; kept for interface compatibility.
        value_col (int): Unused here; kept for interface compatibility.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    name_parts = filename.split('.')
    src = name_parts[0]
    alias = name_parts[1]
    mapping = dict()
    if not self.is_map(alias):
        return mapping

    n1_type = 'Property'
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')

    if alias != 'pathway':
        # gene mapping: no node files are produced, just the dictionary
        with open(filename, 'rb') as map_file:
            rows = csv.reader((raw.decode('utf-8') for raw in map_file),
                              delimiter='\t')
            for row in rows:
                orig_id = row[3].strip()
                orig_name = row[4].strip()
                mod_id = src + '_' + orig_id
                kn_id = orig_name.split(':')[1]
                kn_name = 'EntrezGene'
                mapping[mod_id] = kn_id + '::' + kn_name
        return mapping

    # pathway mapping: also emit property node and node_meta files
    with open(filename, 'rb') as map_file, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        rows = csv.reader((raw.decode('utf-8') for raw in map_file),
                          delimiter='\t')
        meta_out = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
        node_out = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for row in rows:
            orig_id = row[3].strip()
            orig_name = row[4].strip()
            mod_id = src + '_' + orig_id.replace('map', '')
            kn_id = cf.pretty_name(mod_id)
            kn_name = cf.pretty_name(src + '_' + orig_name)
            mapping[orig_id] = kn_id + '::' + kn_name
            node_out.writerow([kn_id, kn_name, n1_type])
            meta_out.writerow([kn_id, 'orig_desc', orig_name])
            meta_out.writerow([kn_id, 'orig_id', orig_id])
    # deduplicate the produced node and node_meta files
    tu.csu(node_file, node_file.replace('node', 'unique.node'))
    tu.csu(n_meta_file,
           n_meta_file.replace('node_meta', 'unique.node_meta'))
    return mapping
def table(self, raw_line, version_dict): """Uses the provided raw_line file to produce a 2table_edge file, an edge_meta file, a node and/or node_meta file (only for property nodes). This returns noting but produces the table formatted files from the provided raw_line file: raw_line (line_hash, line_num, file_id, raw_line) table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name, n2hint, n2type, n2spec, et_hint, score, table_hash) edge_meta (line_hash, info_type, info_desc) node_meta (node_id, info_type (evidence, relationship, experiment, or link), info_desc (text)) node (node_id, n_alias, n_type) Args: raw_line(str): The path to the raw_line file version_dict (dict): A dictionary describing the attributes of the alias for a source. Returns: """ #outfiles table_file = raw_line.replace('raw_line', 'table') n_meta_file = raw_line.replace('raw_line', 'node_meta') node_file = raw_line.replace('raw_line', 'node') e_meta_file = raw_line.replace('raw_line', 'edge_meta') #static column values n1type = 'gene' #ignoring chemicals n1hint = 'UNIPROT_GN' n1spec = 'unknown' n2type = n1type #ignoring chemicals n2hint = n1hint n2spec = n1spec n3_type = 'property' n3hint = 'unknown' n3spec = 'unknown' score = '1' n_type = 'Property' with open(raw_line, encoding='utf-8') as infile, \ open(table_file, 'w') as edges,\ open(e_meta_file, 'w') as e_meta, \ open(n_meta_file, 'w') as n_meta, \ open(node_file, 'w') as nfile: edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n') e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n') n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n') n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n') for line in infile: line = line.replace('"', '').strip().split('\t') if line[1] == '1': #skip header continue chksm = line[0] raw = line[3:] if len(raw) != 7: #extended information continue (n1id, et_hint, n2id, src, publist, n3id, mediator_ids) = raw et_hint = 'pathcom_' + 
et_hint.replace('-', '_') #n1-n2 edge hasher = hashlib.md5() hasher.update('\t'.join([chksm, n1id, n1hint, n1type, n1spec, n2id, n2hint, n2type, n2spec, et_hint, score]).encode()) t_chksum = hasher.hexdigest() edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec, n2id, n2hint, n2type, n2spec, et_hint, score, t_chksum]) e_meta_writer.writerow([chksm, 'original_source', src]) if publist: e_meta_writer.writerow([chksm, 'reference', publist]) #pathway edge if n3id: kn_n3id = cf.pretty_name('paco_' + n3id) n_writer.writerow([kn_n3id, kn_n3id, n_type]) n_meta_writer.writerow([kn_n3id, 'orig_id', n3id]) for node in [n1id, n2id]: hasher = hashlib.md5() hasher.update('\t'.join([chksm, kn_n3id, n3hint, n3_type, n3spec, node, n1hint, n1type, n1spec, 'pathcom_pathway', score]).encode()) t_chksum = hasher.hexdigest() edge_writer.writerow([chksm, kn_n3id, n3hint, n3_type, n3spec, node, n1hint, n1type, n1spec, 'pathcom_pathway', score, t_chksum]) outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta') tu.csu(e_meta_file, outfile, [1, 2, 3]) outfile = node_file.replace('node', 'unique.node') tu.csu(node_file, outfile) outfile = n_meta_file.replace('node_meta', 'unique.node_meta') tu.csu(n_meta_file, outfile)
def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a 2table_edge file, an
    edge_meta file, a node and/or node_meta file (only for property nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
        None
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    # static column values
    alias = version_dict['alias']
    source = version_dict['source']
    # aliases whose gene columns refer to mouse rather than human genes
    mouse_aliases = ["MGI_Mammalian_Phenotype_2013", \
                     "MGI_Mammalian_Phenotype_Level_3",\
                     "MGI_Mammalian_Phenotype_Level_4", "Mouse_Gene_Atlas"]
    n1type = 'property'
    n_type = 'Property'
    n1spec = '0'
    n1hint = source + '_' + alias
    n2type = 'gene'
    if alias in mouse_aliases:
        n2spec = '10090'  # mouse taxon id
        n2hint = 'MGI'
    else:
        n2spec = '9606'  # human taxon id
        n2hint = 'HGNC'
    (et_hint, node_prefix) = self.aliases[alias].split('::')
    score = 1
    # PPI_Hub_Proteins is gene-gene, not property-gene
    if alias == 'PPI_Hub_Proteins':
        n1type = 'gene'
        n1spec = '9606'
        n1hint = 'HGNC'
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            #line = re.split('\s{2,}', line)
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            # GMT-style row: set name followed by member gene ids
            n1_orig_name = raw[0]
            n1_kn_name = n1_orig_name
            # property nodes are only produced for non-PPI aliases
            if alias != 'PPI_Hub_Proteins':
                n1_kn_name = cf.pretty_name(node_prefix + '_' +
                                            n1_orig_name)
                n_meta_writer.writerow(
                    [n1_kn_name, 'orig_desc', n1_orig_name])
                n_writer.writerow([n1_kn_name, n1_kn_name, n_type])
            for n2_id in raw[1:]:
                # entries may carry a trailing ',score'; keep the id only
                n2_id = n2_id.split(',')[0]
                if n2_id == '':
                    continue
                # hash the edge content to produce a stable table_hash
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_kn_name, n1hint,
                                         n1type, n1spec, n2_id, n2hint,
                                         n2type, n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_kn_name, n1hint, n1type,
                                      n1spec, n2_id, n2hint, n2type,
                                      n2spec, et_hint, score, t_chksum])
    if alias != 'PPI_Hub_Proteins':
        # deduplicate the produced node and node_meta files
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
    else:
        # no property nodes for gene-gene edges; drop the empty files
        os.remove(n_meta_file)
        os.remove(node_file)
def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a 2table_edge file, an
    edge_meta file, a node and/or node_meta file (only for property nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
        None
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    #e_meta_file = raw_line.replace('raw_line','edge_meta')
    # static column values
    alias = version_dict['alias']
    source = version_dict['source']
    n1type = 'property'
    n_type = 'Property'
    n1spec = '0'
    n1hint = source + '_' + alias
    n2type = 'gene'
    n2spec = '9606'  # assumption of human genes is occasionally incorrect
    n2hint = 'EntrezGene'
    et_hint = source + '_' + alias.replace(".", "_")
    score = 1
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            # GMT-style row: set name, url, then member gene ids
            n1_orig_name = raw[0]
            n1_url = raw[1]
            # derive a fixed-length node id by hashing the set name
            # (set names can be long / contain arbitrary characters)
            hasher = hashlib.md5()
            hasher.update(n1_orig_name.encode())
            n1_chksum = hasher.hexdigest()
            n1_kn_id = cf.pretty_name('msig_' + n1_chksum)
            n1_kn_name = cf.pretty_name('msig_' + n1_orig_name)
            # NOTE(review): this overwrites the static n1hint
            # (source + '_' + alias) with the per-row node name for every
            # subsequent edge — looks intentional but confirm
            n1hint = n1_kn_name
            n_meta_writer.writerow([n1_kn_id, 'orig_desc', n1_orig_name])
            n_meta_writer.writerow([n1_kn_id, 'link', n1_url])
            n_writer.writerow([n1_kn_id, n1_kn_name, n_type])
            for n2_id in raw[2:]:
                # hash the edge content to produce a stable table_hash
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_kn_id, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_kn_id, n1hint, n1type,
                                      n1spec, n2_id, n2hint, n2type,
                                      n2spec, et_hint, score, t_chksum])
    # deduplicate the produced node and node_meta files
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)