def chunk(filename, total_lines, chunksize=500000):
    """Splits the provided file into chunks of at most chunksize lines each.

    This takes the path to a file and reads through it, splitting it into
    ceiling(total_lines / chunksize) chunks of roughly equal size. It then
    returns the number of chunks and sets up the raw_line table in the
    format:
        (line_hash, line_num, file_id, raw_line)

    Args:
        filename (str): the file to split into chunks
        total_lines (int): the number of lines in the file at filename
        chunksize (int): max size of a single chunk. Defaults to 500000.

    Returns:
        int: the number of chunks filename was split into
    """
    # determine number of chunks
    if 'lincs.level4' in filename:
        num_chunks = MAX_CHUNKS
    else:
        num_chunks = math.ceil(total_lines / int(chunksize))
    num_lines = int(total_lines / num_chunks)

    # determine file output information
    path, file = os.path.split(filename)
    chunk_dir = os.path.join(path, 'chunks')
    os.makedirs(chunk_dir, exist_ok=True)
    source_alias, ext = os.path.splitext(file)
    chunk_file = os.path.join(chunk_dir, source_alias + '.raw_line.')

    # divide file into chunks
    line_count = 0
    with open(filename, 'rb') as infile:
        for i in range(1, num_chunks + 1):
            curr_chunk = chunk_file + str(i) + ext
            with open(curr_chunk, 'wb') as out:
                j = 0
                for line in infile:
                    line_count += 1
                    hasher = hashlib.md5()
                    hasher.update(source_alias.encode())
                    hasher.update(str(line_count).encode())
                    hasher.update(line)
                    md5 = hasher.hexdigest()
                    outline = '\t'.join((md5, str(line_count),
                                         source_alias, ''))
                    out.write(outline.encode())
                    cleanline = line.decode("ascii", errors="ignore")
                    cleanline = cleanline.replace('\n', '')
                    cleanline = '"' + cleanline + '"\n'
                    out.write(cleanline.encode())
                    j += 1
                    if j == num_lines and i < num_chunks:
                        break
            u_chunk_file = curr_chunk.replace('raw_line', 'unique.raw_line')
            tu.csu(curr_chunk, u_chunk_file)
    return num_chunks

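# The line_hash column written by chunk() (and by format_raw_line() below)
# is reproducible from the file alias, the 1-based line number, and the raw
# line bytes. A minimal sketch of that md5 recipe, factored out purely for
# illustration; it is not part of the pipeline and assumes the module-level
# hashlib import:
def line_hash(source_alias, line_num, raw_bytes):
    """Recompute the md5 line hash used in the raw_line tables."""
    hasher = hashlib.md5()
    hasher.update(source_alias.encode())
    hasher.update(str(line_num).encode())
    hasher.update(raw_bytes)
    return hasher.hexdigest()
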
def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    This returns a dictionary for use in mapping nodes or edge types from
    the file specified by filename. It parses the [Term] stanzas of the
    obo-formatted raw_line file, mapping each term id (and any alt_id) to
    its 'kn_id::kn_name' string. The key_col and value_col arguments are
    accepted for interface compatibility but are not used by this parser.

    Args:
        filename (str): The name of the file containing the information
            needed to produce the mapping dictionary.
        key_col (int): Unused; present for interface compatibility.
        value_col (int): Unused; present for interface compatibility.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    term_map = dict()
    n_type = 'Property'
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')
    orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
    skip = True
    with open(filename) as infile, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        reader = csv.reader(infile, delimiter='\t')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in reader:
            raw = line[3]
            if raw.startswith('[Term]'):
                skip = False
                orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
                continue
            if raw.startswith('[Typedef]'):
                skip = True
                continue
            if skip:
                continue
            if raw.startswith('id: '):
                orig_id = raw[4:].strip()
                kn_id = cf.pretty_name(orig_id)
                continue
            if raw.startswith('name: '):
                orig_name = raw[6:].strip()
                kn_name = cf.pretty_name('go_' + orig_name)
                term_map[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name, n_type])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            if raw.startswith('alt_id: '):
                alt_id = raw[8:].strip()
                term_map[alt_id] = kn_id + '::' + kn_name
                n_meta_writer.writerow([kn_id, 'alt_alias', alt_id])
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    return term_map

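# All create_mapping_dict variants store their values as 'kn_id::kn_name',
# so consumers can recover either part with one split. A minimal sketch of
# that convention (this helper is illustrative, not part of the pipeline):
def split_mapping_value(value):
    """Split a 'kn_id::kn_name' mapping value into its two parts."""
    kn_id, kn_name = value.split('::', 1)
    return kn_id, kn_name
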
def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    This returns a dictionary for use in mapping nodes or edge types from
    the file specified by filename. By default it opens the file specified
    by filename and creates a dictionary using the key_col column as the
    key and the value_col column as the value.

    Args:
        filename (str): The name of the file containing the information
            needed to produce the mapping dictionary.
        key_col (int): The column containing the key for creating the
            dictionary. By default this is column 3.
        value_col (int): The column containing the value for creating the
            dictionary. By default this is column 4.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    src = filename.split('.')[0]
    alias = filename.split('.')[1]
    map_dict = dict()
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')
    if not self.is_map(alias):
        return map_dict
    with open(filename, 'rb') as map_file, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        reader = csv.reader((line.decode('utf-8') for line in map_file),
                            delimiter='\t')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in reader:
            orig_id = line[key_col].strip()
            orig_name = line[value_col].strip()
            kn_id = cf.pretty_name(orig_id)
            kn_name = cf.pretty_name(src + '_' + orig_name)
            map_dict[orig_id] = kn_id + '::' + kn_name
            n_writer.writerow([kn_id, kn_name])
            n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
            n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    return map_dict

def format_raw_line(filename):
    """Creates the raw_line table from the provided file and returns the
    path to the output file.

    This takes the path to a file and reads through it, adding three
    tab-separated columns to the beginning of each line, saving the result
    to disk, and then returning the output file path. Output looks like:
        raw_line table (line_hash, line_num, file_id, line_str)

    Args:
        filename (str): the file to convert to raw_line table format

    Returns:
        str: the path to the output file
    """
    # determine file output information
    path, file = os.path.split(filename)
    source_alias, ext = os.path.splitext(file)
    raw_line = os.path.join(path, source_alias + '.raw_line' + ext)

    # convert the file to raw_line format
    line_count = 0
    with open(filename, 'rb') as infile:
        with open(raw_line, 'wb') as outfile:
            for line in infile:
                line_count += 1
                hasher = hashlib.md5()
                hasher.update(source_alias.encode())
                hasher.update(str(line_count).encode())
                hasher.update(line)
                md5 = hasher.hexdigest()
                outline = '\t'.join([md5, str(line_count), source_alias,
                                     ''])
                outfile.write(outline.encode())
                cleanline = line.decode('ascii', 'ignore')
                outfile.write(cleanline.encode())
    tu.csu(raw_line, raw_line.replace('raw_line', 'unique.raw_line'),
           [1, 2, 3])
    return raw_line

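# A minimal usage sketch for format_raw_line(), assuming a downloaded file
# 'srcalias.txt' exists (the filename is illustrative):
#
#   raw_path = format_raw_line('srcalias.txt')
#   # raw_path == 'srcalias.raw_line.txt', where each row is
#   # line_hash \t line_num \t file_id \t line_str
#   # and tu.csu also writes 'srcalias.unique.raw_line.txt' from
#   # columns 1-3.
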
def table(raw_line, version_dict, taxid_list=None):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.
        taxid_list (list): A list of taxon ids to support

    Returns:
    """
    if taxid_list is None:
        taxid_list = []
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    n1type = 'gene'
    n2type = 'gene'
    score = 1
    src_specific_hints = ["intact", "biogrid"]
    # mapping files
    ppi = os.path.join('..', '..', 'ppi', 'obo_map', 'ppi.obo_map.json')
    with open(ppi) as infile:
        term_map = json.load(infile)
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(e_meta_file, 'w') as e_meta:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                   lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if len(line) == 1:
                continue
            if line[1] == '1':  # skip header line
                continue
            chksm = line[0]
            raw = line[3:]
            n1list = raw[0].split('|') + raw[2].split('|')
            n2list = raw[1].split('|') + raw[3].split('|')
            if not n1list or not n2list:
                continue
            match = re.search(r'taxid:(\d+)', raw[9])
            if match is not None:
                n1spec = match.group(1)
                if taxid_list and n1spec not in taxid_list:
                    continue
            else:
                continue
            match = re.search(r'taxid:(\d+)', raw[10])
            if match is not None:
                n2spec = match.group(1)
                if taxid_list and n2spec not in taxid_list:
                    continue
            else:
                continue
            if len(raw) > 35 and raw[35].upper() == 'TRUE':
                et_hint = 'PPI_negative'
            else:
                match = re.search(r'(MI:\d+)', raw[11])
                if match is not None:
                    et_hint = term_map[match.group(1)]
                else:
                    continue
            for n1tuple in n1list:
                if n1tuple.count(':') != 1:
                    continue
                n1hint, n1id = n1tuple.split(':')
                if n1hint in src_specific_hints:
                    continue
                for n2tuple in n2list:
                    if n2tuple.count(':') != 1:
                        continue
                    n2hint, n2id = n2tuple.split(':')
                    if n2hint in src_specific_hints:
                        continue
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1id, n1hint, n1type,
                                             n1spec, n2id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1id, n1hint, n1type,
                                          n1spec, n2id, n2hint, n2type,
                                          n2spec, et_hint, score,
                                          t_chksum])
            publist = raw[8]
            interaction_id = raw[13]
            e_meta_writer.writerow([chksm, 'reference', publist])
            e_meta_writer.writerow([chksm, 'detail', interaction_id])
    outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
    tu.csu(e_meta_file, outfile)

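# The species filter above pulls the numeric taxon id out of the
# 'taxid:NNNN(...)' interaction fields. A minimal sketch of the same
# extraction, using the pattern copied from the loop above (the helper is
# illustrative, not part of the pipeline):
import re

def parse_taxid(field):
    """Return the taxon id from a field like 'taxid:9606(human)', or None."""
    match = re.search(r'taxid:(\d+)', field)
    return match.group(1) if match is not None else None

# parse_taxid('taxid:9606(Homo sapiens)') == '9606'
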
def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    #n_meta_file = raw_line.replace('raw_line', 'node_meta')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    n1type = 'gene'
    n1hint = 'unknown'
    n2type = 'gene'
    n2hint = 'unknown'
    info_type = 'combined_score'
    edge_types = {2: 'STRING_neighborhood',
                  3: 'STRING_fusion',
                  4: 'STRING_cooccurence',
                  5: 'STRING_coexpression',
                  6: 'STRING_experimental',
                  7: 'STRING_database',
                  8: 'STRING_textmining'}
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(e_meta_file, 'w') as e_meta:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                   lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if line[1] == '1':  # skip header line
                continue
            chksm = line[0]
            raw = line[3].split(' ')
            n1list = raw[0].split('.')
            n2list = raw[1].split('.')
            if len(n1list) < 2 or len(n2list) < 2:
                continue
            n1spec = n1list[0]
            n1id = '.'.join(n1list[1:])
            n2spec = n2list[0]
            n2id = '.'.join(n2list[1:])
            for ety in edge_types:
                et_hint = edge_types[ety]
                score = raw[ety]
                if score == '0':
                    continue
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1id, n1hint, n1type,
                                         n1spec, n2id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                      n2id, n2hint, n2type, n2spec,
                                      et_hint, score, t_chksum])
            c_score = raw[9]
            e_meta_writer.writerow([chksm, info_type, c_score])
    outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
    tu.csu(e_meta_file, outfile, [1, 2, 3])

def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    #e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    n1type = 'property'
    n_type = 'Property'
    n2type = 'gene'
    n1hint = 'Pfam/Family'
    n2hint = 'Uniprot_gn'
    et_hint = 'pfam_prot'
    n1spec = '0'
    map_dict = dict()
    src = 'pf'
    # map the file name
    species = os.path.join('..', '..', 'id_map', 'species', 'species.json')
    with open(species) as infile:
        species_map = json.load(infile)
    n2spec = version_dict['alias']
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split()
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            # skip commented lines
            comment_match = re.match('#', raw[0])
            if comment_match is not None:
                continue
            orig_id = raw[5].strip()
            orig_name = raw[6].strip()
            kn_id = cf.pretty_name(src + '_' + orig_id)
            kn_name = cf.pretty_name(src + '_' + orig_name)
            map_dict[orig_id] = kn_id + '::' + kn_name
            n_writer.writerow([kn_id, kn_name, n_type])
            n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
            n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            n2orig = raw[0]
            evalue = float(raw[12])
            score = self.sc_min
            if evalue == 0.0:
                score = self.sc_max
            if evalue > 0.0:
                score = round(-1.0 * math.log10(evalue), 4)
            if score > self.sc_max:
                score = self.sc_max
            if score < self.sc_min:
                continue
            output = [chksm, kn_id, n1hint, n1type, n1spec, n2orig, n2hint,
                      n2type, n2spec, et_hint, str(score)]
            hasher = hashlib.md5()
            hasher.update('\t'.join(output).encode())
            t_chksum = hasher.hexdigest()
            edge_writer.writerow(output + [t_chksum])
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)

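# The Pfam edge score above is a clipped -log10 of the match e-value. A
# worked restatement of that arithmetic with illustrative bounds (the real
# sc_min/sc_max live on the class; 0.2 and 10.0 here are assumptions):
import math

def evalue_to_score(evalue, sc_min=0.2, sc_max=10.0):
    """Map an e-value to a bounded edge score; None means skip the edge."""
    if evalue == 0.0:
        return sc_max  # a perfect hit saturates the scale
    score = round(-1.0 * math.log10(evalue), 4)
    if score > sc_max:
        return sc_max  # clip very strong hits to the ceiling
    if score < sc_min:
        return None  # weak hits are dropped upstream
    return score

# evalue_to_score(1e-5) == 5.0; evalue_to_score(0.9) -> None (0.0458 < sc_min)
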
def main(tablefile, args=None):
    """Maps the nodes for the source:alias tablefile.

    This takes the path to a tablefile (see table_utilities.main) and maps
    the nodes in it using the Redis DB. It then outputs a status file in
    the format (table_hash, n1, n2, edge_type, weight, edge_hash,
    line_hash, status, status_desc), where status is production if both
    nodes mapped and unmapped otherwise. It also outputs an edge file with
    all rows where status is production, in the format (edge_hash, n1, n2,
    edge_type, weight), and an edge2line file in the format (edge_hash,
    line_hash).

    Args:
        tablefile (str): path to a tablefile to be mapped
        args (Namespace): args as populated namespace or 'None' for
            defaults
    """
    if args is None:
        args = cf.config_args()
    if 'lincs.level4' in tablefile or 'lincs.exp_meta' in tablefile:
        if os.path.isfile(tablefile.replace('conv', 'node')):
            iu.import_pnode(tablefile.replace('conv', 'node'), args)
        iu.import_edge(tablefile, args)
        return
    rdb = ru.get_database(args)
    edge_file = tablefile.replace('table', 'edge')
    status_file = tablefile.replace('table', 'status')
    ue_file = tablefile.replace('table', 'unique.edge')
    ue2l_file = tablefile.replace('table', 'unique.edge2line')
    us_file = tablefile.replace('table', 'unique.status')
    src_data_dir = os.path.join(args.working_dir, args.data_path,
                                cf.DEFAULT_MAP_PATH)
    species_file = os.path.join(src_data_dir, 'species', 'species.json')
    with open(species_file, 'r') as infile:
        species_dict = json.load(infile)
    supported_taxids = ['unknown'] + list(species_dict.values())
    with open(tablefile, 'r') as infile, \
            open(edge_file, 'w') as edge, \
            open(status_file, 'w') as e_stat:
        reader = csv.reader(infile, delimiter='\t')
        s_writer = csv.writer(e_stat, delimiter='\t', lineterminator='\n')
        e_writer = csv.writer(edge, delimiter='\t', lineterminator='\n')
        # first pass: batch the gene ids by (hint, taxid) for mapping
        to_map = defaultdict(list)
        for line in reader:
            (n1, hint, ntype, taxid) = line[1:5]
            if ntype == 'gene' and taxid in supported_taxids:
                to_map[hint, taxid].append(n1)
            (n2, hint, ntype, taxid) = line[5:9]
            if ntype == 'gene' and taxid in supported_taxids:
                to_map[hint, taxid].append(n2)
        infile.seek(0)
        mapped = {
            k: {n: m for m, n in zip(ru.conv_gene(rdb, v, k[0], k[1]), v)}
            for k, v in to_map.items()
        }
        # second pass: write the mapped edge and status rows
        for line in reader:
            (n1, hint, ntype, taxid) = line[1:5]
            if ntype == 'gene':
                if taxid not in supported_taxids:
                    n1_map = 'unmapped-unsupported-species'
                else:
                    n1_map = mapped[hint, taxid][n1]
            else:
                n1_map = n1
            (n2, hint, ntype, taxid) = line[5:9]
            if ntype == 'gene':
                if taxid not in supported_taxids:
                    n2_map = 'unmapped-unsupported-species'
                else:
                    n2_map = mapped[hint, taxid][n2]
            else:
                n2_map = n2
            chksum = line[0]  # line chksum
            et_map = line[9]
            weight = line[10]
            t_chksum = line[11]  # raw edge chksum
            hasher = hashlib.md5()
            hasher.update('\t'.join([n1_map, n2_map, et_map]).encode())
            e_chksum = hasher.hexdigest()
            if 'unmapped' in n1_map:
                status = 'unmapped'
                status_desc = n1_map
            elif 'unmapped' in n2_map:
                status = 'unmapped'
                status_desc = n2_map
            else:
                status = 'production'
                status_desc = 'mapped'
            e_writer.writerow([e_chksum, n1_map, n2_map, et_map, weight])
            s_writer.writerow([t_chksum, n1_map, n2_map, et_map, weight,
                               e_chksum, chksum, status, status_desc])
    tu.csu(edge_file, ue_file)
    tu.csu(status_file, us_file)
    tu.csu(us_file, ue2l_file, [6, 7])

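# The two-pass mapping above batches node ids per (hint, taxid) bucket and
# issues one ru.conv_gene call per bucket. A minimal sketch of that pattern
# with the converter abstracted away (batch_map and conv are illustrative;
# the real converter is ru.conv_gene against Redis):
from collections import defaultdict

def batch_map(rows, conv):
    """rows: iterable of (name, hint, taxid); conv(names, hint, taxid) -> mapped names."""
    to_map = defaultdict(list)
    for name, hint, taxid in rows:
        to_map[hint, taxid].append(name)
    return {key: dict(zip(names, conv(names, key[0], key[1])))
            for key, names in to_map.items()}

# e.g. with a stub converter that upper-cases ids:
#   batch_map([('tp53', 'symbol', '9606')],
#             lambda names, hint, taxid: [n.upper() for n in names])
#   -> {('symbol', '9606'): {'tp53': 'TP53'}}
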
def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    alias = version_dict['alias']
    source = version_dict['source']
    n1type = 'property'
    n1spec = '0'
    n2type = 'gene'
    info_type1 = 'reference'
    info_type2 = 'evidence'
    # mapping files
    obo_file = os.path.join('..', 'obo_map', 'go.obo_map.json')
    with open(obo_file) as infile:
        obo_map = json.load(infile)
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(e_meta_file, 'w') as e_meta:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                   lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            # skip commented lines
            comment_match = re.match('!', raw[0])
            if comment_match is not None:
                continue
            qualifier = raw[3]
            # skip "NOT" annotations
            not_match = re.search('NOT', qualifier)
            if not_match is not None:
                continue
            n1orig = raw[4]
            n1_mapped = obo_map.get(n1orig, "unmapped:no-name::unmapped")
            (n1_id, n1hint) = n1_mapped.split('::')
            n2spec_str = raw[12].split("|", 1)[0].rstrip()  # only take first species
            n2spec = n2spec_str.split(":", 1)[1]  # remove 'taxon:' label
            if n2spec == '559292':  # manually overwrite taxid for Scer
                n2spec = '4932'
            reference = raw[5]
            anno_evidence = raw[6]
            score = 2
            et_hint = 'gene_ontology'
            if anno_evidence == 'IEA':
                score = 1
            n2_id = raw[1]
            n2hint = raw[0]
            if n2hint == "UniProtKB":
                n2hint = "uniprot_gn"
            if n1hint == "UniProtKB":
                n1hint = "uniprot_gn"
            for idx in range(1, 3):  # loop twice: once per n2 identifier
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                      n2_id, n2hint, n2type, n2spec,
                                      et_hint, score, t_chksum])
                n2_id = raw[2]
            e_meta_writer.writerow([chksm, info_type1, reference])
            e_meta_writer.writerow([chksm, info_type2, anno_evidence])
    outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
    tu.csu(e_meta_file, outfile)

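# The range(1, 3) loop above emits each annotation edge twice: first with
# the DB object id (raw[1]) and then with the DB object symbol (raw[2]), so
# the downstream mapper can resolve whichever identifier it recognizes. An
# equivalent restatement, for illustration only:
#
#   for n2_id in (raw[1], raw[2]):
#       ...hash and write the edge row with this n2_id...
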
def create_mapping_dict(self, filename, key_col=3, value_col=4):
    """Return a mapping dictionary for the provided file.

    This returns a dictionary for use in mapping nodes or edge types from
    the file specified by filename. It reads column 3 as the key and
    column 4 as the value; the key_col and value_col arguments are
    accepted for interface compatibility but this implementation always
    reads those fixed columns.

    Args:
        filename (str): The name of the file containing the information
            needed to produce the mapping dictionary.
        key_col (int): Unused; present for interface compatibility.
        value_col (int): Unused; present for interface compatibility.

    Returns:
        dict: A dictionary for use in mapping nodes or edge types.
    """
    src = filename.split('.')[0]
    alias = filename.split('.')[1]
    map_dict = dict()
    n1_type = 'Property'
    n_meta_file = filename.replace('raw_line', 'node_meta')
    node_file = filename.replace('raw_line', 'node')
    if not self.is_map(alias):
        return map_dict
    if alias == 'pathway':
        with open(filename, 'rb') as map_file, \
                open(n_meta_file, 'w') as n_meta, \
                open(node_file, 'w') as nfile:
            reader = csv.reader((line.decode('utf-8') for line in map_file),
                                delimiter='\t')
            n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t',
                                  lineterminator='\n')
            for line in reader:
                orig_id = line[3].strip()
                orig_name = line[4].strip()
                mod_id = src + '_' + orig_id.replace('map', '')
                kn_id = cf.pretty_name(mod_id)
                kn_name = cf.pretty_name(src + '_' + orig_name)
                map_dict[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name, n1_type])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
    else:
        with open(filename, 'rb') as map_file:
            reader = csv.reader((line.decode('utf-8') for line in map_file),
                                delimiter='\t')
            for line in reader:
                orig_id = line[3].strip()
                orig_name = line[4].strip()
                mod_id = src + '_' + orig_id
                kn_id = orig_name.split(':')[1]
                kn_name = 'EntrezGene'
                map_dict[mod_id] = kn_id + '::' + kn_name
    return map_dict

def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    n1type = 'gene'  # ignoring chemicals
    n1hint = 'UNIPROT_GN'
    n1spec = 'unknown'
    n2type = n1type  # ignoring chemicals
    n2hint = n1hint
    n2spec = n1spec
    n3_type = 'property'
    n3hint = 'unknown'
    n3spec = 'unknown'
    score = '1'
    n_type = 'Property'
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(e_meta_file, 'w') as e_meta, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                   lineterminator='\n')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if line[1] == '1':  # skip header
                continue
            chksm = line[0]
            raw = line[3:]
            if len(raw) != 7:  # extended information
                continue
            (n1id, et_hint, n2id, src, publist, n3id, mediator_ids) = raw
            et_hint = 'pathcom_' + et_hint.replace('-', '_')
            # n1-n2 edge
            hasher = hashlib.md5()
            hasher.update('\t'.join([chksm, n1id, n1hint, n1type, n1spec,
                                     n2id, n2hint, n2type, n2spec, et_hint,
                                     score]).encode())
            t_chksum = hasher.hexdigest()
            edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                  n2id, n2hint, n2type, n2spec, et_hint,
                                  score, t_chksum])
            e_meta_writer.writerow([chksm, 'original_source', src])
            if publist:
                e_meta_writer.writerow([chksm, 'reference', publist])
            # pathway edge
            if n3id:
                kn_n3id = cf.pretty_name('paco_' + n3id)
                n_writer.writerow([kn_n3id, kn_n3id, n_type])
                n_meta_writer.writerow([kn_n3id, 'orig_id', n3id])
                for node in [n1id, n2id]:
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, kn_n3id, n3hint,
                                             n3_type, n3spec, node, n1hint,
                                             n1type, n1spec,
                                             'pathcom_pathway',
                                             score]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, kn_n3id, n3hint, n3_type,
                                          n3spec, node, n1hint, n1type,
                                          n1spec, 'pathcom_pathway', score,
                                          t_chksum])
    outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
    tu.csu(e_meta_file, outfile, [1, 2, 3])
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)

def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    # static column values
    alias = version_dict['alias']
    source = version_dict['source']
    mouse_aliases = ["MGI_Mammalian_Phenotype_2013",
                     "MGI_Mammalian_Phenotype_Level_3",
                     "MGI_Mammalian_Phenotype_Level_4", "Mouse_Gene_Atlas"]
    n1type = 'property'
    n_type = 'Property'
    n1spec = '0'
    n1hint = source + '_' + alias
    n2type = 'gene'
    if alias in mouse_aliases:
        n2spec = '10090'
        n2hint = 'MGI'
    else:
        n2spec = '9606'
        n2hint = 'HGNC'
    (et_hint, node_prefix) = self.aliases[alias].split('::')
    score = 1
    if alias == 'PPI_Hub_Proteins':
        n1type = 'gene'
        n1spec = '9606'
        n1hint = 'HGNC'
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            #line = re.split('\s{2,}', line)
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            n1_orig_name = raw[0]
            n1_kn_name = n1_orig_name
            if alias != 'PPI_Hub_Proteins':
                n1_kn_name = cf.pretty_name(node_prefix + '_' +
                                            n1_orig_name)
                n_meta_writer.writerow([n1_kn_name, 'orig_desc',
                                        n1_orig_name])
                n_writer.writerow([n1_kn_name, n1_kn_name, n_type])
            for n2_id in raw[1:]:
                n2_id = n2_id.split(',')[0]
                if n2_id == '':
                    continue
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_kn_name, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_kn_name, n1hint, n1type,
                                      n1spec, n2_id, n2hint, n2type,
                                      n2spec, et_hint, score, t_chksum])
    if alias != 'PPI_Hub_Proteins':
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
    else:
        os.remove(n_meta_file)
        os.remove(node_file)

def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    alias = version_dict['alias']
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    if alias == 'Ensembl2Reactome_All_Levels':
        # static column values
        n1type = 'property'
        n1spec = '0'
        n2type = 'gene'
        n2hint = 'Ensembl_GeneID'
        score = 1
        # mapping files
        pathway = os.path.join('..', 'ReactomePathways',
                               'reactome.ReactomePathways.json')
        with open(pathway) as infile:
            path_map = json.load(infile)
        species = os.path.join('..', '..', 'id_map', 'species',
                               'species.json')
        with open(species) as infile:
            species_map = json.load(infile)
        with open(raw_line, encoding='utf-8') as infile, \
                open(table_file, 'w') as edges, \
                open(n_meta_file, 'w') as n_meta, \
                open(e_meta_file, 'w') as e_meta:
            edge_writer = csv.writer(edges, delimiter='\t',
                                     lineterminator='\n')
            n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                       lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                       lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]
                n1_orig_id = raw[1]
                n1_mapped = path_map.get(n1_orig_id,
                                         "unmapped:no-name::unmapped")
                (n1_id, n1hint) = n1_mapped.split('::')
                n1_link = raw[2]
                n2_id = raw[0]
                n2spec_str = raw[5]
                n2spec = species_map.get(n2spec_str,
                                         "unmapped:unsupported-species")
                evidence = raw[4]  # evidence code (e.g. IEA)
                score = 2
                et_hint = 'reactome_annotation'
                if evidence == 'IEA':
                    score = 1
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                      n2_id, n2hint, n2type, n2spec,
                                      et_hint, score, t_chksum])
                n_meta_writer.writerow([n1_id, 'link', n1_link])
                e_meta_writer.writerow([chksm, 'evidence', evidence])
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
    if alias == 'reactome.homo_sapiens.interactions.tab-delimited':
        # static column values
        n1type = 'gene'
        n1spec = '9606'
        n2type = 'gene'
        n2spec = '9606'
        score = 1
        with open(raw_line, encoding='utf-8') as infile, \
                open(table_file, 'w') as edges, \
                open(e_meta_file, 'w') as e_meta:
            edge_writer = csv.writer(edges, delimiter='\t',
                                     lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t',
                                       lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]
                # skip commented lines
                comment_match = re.match('#', raw[0])
                if comment_match is not None:
                    continue
                n1_str = raw[0]
                n1hint = n1_str.split(':', 1)[0]
                n1_id = n1_str.split(':', 1)[1]
                n2_str = raw[3]
                n2hint = n2_str.split(':', 1)[0]
                if n2hint == "":
                    continue
                n2_id = n2_str.split(':', 1)[1]
                et_str = raw[6]
                et_hint = 'reactome_PPI_' + et_str
                detail_str = raw[7]
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                      n2_id, n2hint, n2type, n2spec,
                                      et_hint, score, t_chksum])
                e_meta_writer.writerow([chksm, 'detail', detail_str])
                if len(raw) > 8:
                    ref_str = raw[8]
                    e_meta_writer.writerow([chksm, 'reference', ref_str])
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile)

def table(self, raw_line, version_dict):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, and a node and/or node_meta file (only for property
    nodes).

    This returns nothing but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec, n2name,
            n2hint, n2type, n2spec, et_hint, score, table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id, info_type (evidence, relationship, experiment,
            or link), info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.

    Returns:
    """
    # outfiles
    table_file = raw_line.replace('raw_line', 'table')
    n_meta_file = raw_line.replace('raw_line', 'node_meta')
    node_file = raw_line.replace('raw_line', 'node')
    #e_meta_file = raw_line.replace('raw_line', 'edge_meta')
    # static column values
    alias = version_dict['alias']
    source = version_dict['source']
    n1type = 'property'
    n_type = 'Property'
    n1spec = '0'
    n1hint = source + '_' + alias
    n2type = 'gene'
    n2spec = '9606'  # assumption of human genes is occasionally incorrect
    n2hint = 'EntrezGene'
    et_hint = source + '_' + alias.replace(".", "_")
    score = 1
    with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        n_meta_writer = csv.writer(n_meta, delimiter='\t',
                                   lineterminator='\n')
        n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if len(line) == 1:
                continue
            chksm = line[0]
            raw = line[3:]
            n1_orig_name = raw[0]
            n1_url = raw[1]
            hasher = hashlib.md5()
            hasher.update(n1_orig_name.encode())
            n1_chksum = hasher.hexdigest()
            n1_kn_id = cf.pretty_name('msig_' + n1_chksum)
            n1_kn_name = cf.pretty_name('msig_' + n1_orig_name)
            n1hint = n1_kn_name
            n_meta_writer.writerow([n1_kn_id, 'orig_desc', n1_orig_name])
            n_meta_writer.writerow([n1_kn_id, 'link', n1_url])
            n_writer.writerow([n1_kn_id, n1_kn_name, n_type])
            for n2_id in raw[2:]:
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1_kn_id, n1hint, n1type,
                                         n1spec, n2_id, n2hint, n2type,
                                         n2spec, et_hint,
                                         str(score)]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1_kn_id, n1hint, n1type,
                                      n1spec, n2_id, n2hint, n2type,
                                      n2spec, et_hint, score, t_chksum])
    outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
    tu.csu(n_meta_file, outfile)
    outfile = node_file.replace('node', 'unique.node')
    tu.csu(node_file, outfile)