def read_tigr_roles(filename):
    """
    Read the TIGRFAM mainrole/subrole hierarchy from a tab-delimited file.

    Each non-blank line is expected to carry the role id in column 1, the
    role type ('mainrole' or 'sub1role', with an optional trailing colon) in
    column 2 and the role name in column 3; column 0 is ignored.  A sub role
    is attached to the main role that was listed with the same role id.

    Returns a tuple (mainroles, roles_by_id):
      mainroles   -- list of main roles, deduplicated by name, each with a
                     ``children`` list holding its sub roles
      roles_by_id -- dictionary from tigr role id to sub role, which can be
                     used with the file TIGRFAMS_ROLE_LINK to attach
                     TIGRFams to roles

    Raises ValueError for a role type other than mainrole or sub1role, and
    KeyError if a sub role's id was never listed with a main role.
    """
    mainroles = []
    mainroles_by_name = {}
    mainroles_by_id = {}
    roles_by_id = {}

    with open(filename, 'r') as f:
        for line in f:
            line = line.rstrip("\r\n")
            # tolerate blank lines (e.g. trailing newlines at end of file)
            if not line.strip():
                continue
            fields = line.split("\t")

            role = OpenStruct()
            role.tigr_role_id = int(fields[1])
            role.type = fields[2].rstrip(':')
            role.name = fields[3]

            if role.type == 'mainrole':
                # the same main role name can appear under several ids;
                # only the first object seen for a name is kept
                if role.name not in mainroles_by_name:
                    mainroles.append(role)
                    mainroles_by_name[role.name] = role
                    role.children = []
                mainroles_by_id[role.tigr_role_id] = mainroles_by_name[role.name]
            elif role.type == 'sub1role':
                roles_by_id[role.tigr_role_id] = role
            else:
                # string exceptions are not valid; raise a real exception
                raise ValueError("Unknown role type: " + role.type)

    # add subroles to main roles
    for role_id, subrole in roles_by_id.items():
        mainroles_by_id[role_id].children.append(subrole)

    return (mainroles, roles_by_id)
def read_microbes_online_genome_info(filename):
    """
    Read a genomeInfo.txt file from microbes online.

    The first line is a tab-delimited header naming the columns.  Every
    following non-empty line becomes one gene object whose entries are keyed
    by column name; values are kept as the strings found in the file.

    Return a list of gene objects (empty if the file is empty or holds only
    a header).  Raises IndexError if a row has fewer fields than the header.
    """
    # we'll be making and returning a list of gene objects
    genes = []

    with open(filename, 'r') as f:
        # read header to learn the column names
        # these files have these columns: locusId, accession, GI, scaffoldId, start, stop, strand,
        # sysName, name, desc, COG, COGFun, COGDesc, TIGRFam, TIGRRoles, GO, EC, ECDesc
        header = next(f, None)
        if header is None:
            # empty file: nothing to read
            return genes
        column_names = header.rstrip("\r\n").split("\t")

        for line in f:
            row = line.rstrip("\r\n")
            # skip empty lines rather than failing on the missing columns
            if not row:
                continue
            fields = row.split("\t")

            # create an object for each row
            gene = OpenStruct()
            for index, column_name in enumerate(column_names):
                gene[column_name] = fields[index]
            genes.append(gene)

    return genes
def read_cogs(filename):
    """
    Read COG functions.

    Only lines that start with ``[<word>] COG<digits> <text>`` are used; all
    other lines are ignored.  Each returned object carries ``id`` (the COG
    accession), ``name`` (the rest of the line), ``parents`` (the bracketed
    word) and ``namespace`` set to 'cog'.
    """
    pattern = re.compile(r'\[(\w+)\]\s+(COG\d+)\s+(.*)')
    records = []

    with open(filename, 'r') as handle:
        for text in handle:
            hit = pattern.match(text)
            if hit is None:
                # not a COG entry line
                continue
            bracketed, accession, label = hit.groups()
            entry = OpenStruct()
            entry.id = accession
            entry.name = label
            entry.parents = bracketed
            entry.namespace = 'cog'
            records.append(entry)

    return records
def read_tigrfams_by_role(filename): """ This is no longer used! Reads the hierarchical structure of TIGRFams organized into categories called roles. Returns a nested list structure of roles and sub-roles that hold tigrfams. Downloaded file from here: http://cmr.jcvi.org/tigr-scripts/CMR/shared/EvidenceList.cgi?ev_type=TIGRFAM&order_type=role Note the TIGRFams flat file is more complete than the TIGRFams by role file. """ # we'll be making and returning a nested list of tigr roles holding tigrfams tigrfams_by_role = [] with open(filename, 'r') as f: category = None subcategory = None for line in f: # skip blank lines if len(line.strip())==0: continue if line.startswith(" "): fields = line.lstrip(' ').rstrip("\n").split("\t") # skip column headers if fields[0] == 'Accession': continue tigrfam = OpenStruct() tigrfam.id = fields[0] tigrfam.name = fields[1] tigrfam.description = fields[2] subcategory['tigrfams'].append(tigrfam) elif line.startswith(" "): name = line.strip() subcategory = {'name':name, 'tigrfams':[]} category['roles'].append(subcategory) else: name = line.strip() category = {'name':name, 'roles':[]} tigrfams_by_role.append(category) return tigrfams_by_role
def read_tigrfams(filename):
    """
    Read the flat listing of TIGRFams.

    The first line of the file is a header and is skipped.  Every following
    non-empty line is tab-delimited with the TIGRFam id, name and description
    in columns 0, 1 and 2.

    Returns a list of objects with ``id``, ``name`` and ``description``
    (empty if the file is empty or holds only the header).  Raises IndexError
    if a row has fewer than three fields.

    Note the TIGRFams flat file is more complete than the TIGRFams by role file.
    The flat file is a superset of the by-role file.
    """
    tigrfams = []

    with open(filename, 'r') as f:
        # skip header; the default keeps an empty file from raising StopIteration
        next(f, None)

        for line in f:
            row = line.rstrip("\r\n")
            # skip empty lines rather than failing on the missing columns
            if not row:
                continue
            fields = row.split("\t")

            tigrfam = OpenStruct()
            tigrfam.id = fields[0]
            tigrfam.name = fields[1]
            tigrfam.description = fields[2]
            tigrfams.append(tigrfam)

    return tigrfams
def read_cog_categories(filename):
    """
    Read COG functional categories
    (see http://www.ncbi.nlm.nih.gov/COG/grace/fiew.cgi)

    A line that starts with a single capital letter followed by a tab
    describes a sub-category: column 0 is its id and column 3 its name.  Any
    other non-blank line names a top-level category, which becomes the parent
    of the sub-categories that follow it.  Blank lines are ignored.

    Returns a flat list, in file order, of objects:
      categories     -- ``name`` and ``namespace`` "cog category"
      sub-categories -- ``id``, ``name``, ``parents`` (a 1-tuple holding the
                        name of the enclosing category, or None if no
                        category line came first) and ``namespace``
                        "cog subcategory"
    """
    cog_categories = []
    parent = None
    # compiled once; a sub-category row is a capital letter followed by a tab
    subcategory_re = re.compile(r'[A-Z]\t')

    with open(filename, 'r') as f:
        for line in f:
            # a blank line is neither a category nor a sub-category; without
            # this guard it would be recorded as a category named "" and
            # would clobber the current parent
            if not line.strip():
                continue

            c = OpenStruct()
            if subcategory_re.match(line):
                fields = line.rstrip("\r\n").split("\t")
                c.id = fields[0]
                c.name = fields[3]
                c.parents = (parent,)
                c.namespace = "cog subcategory"
            else:
                c.name = line.rstrip("\r\n")
                c.namespace = "cog category"
                parent = c.name
            cog_categories.append(c)

    return cog_categories
def read_genes(filename, chromosome=None, chromosome_map=None, rna=False):
    """
    Read a tab-delimited gene table and return a list of gene objects.

    The first line of the file is a title and the second line holds the
    column headers.  Columns used are Locus_tag, Locus, Gi (optional),
    GeneID, Strand, Start, End and Product Name.  A value of '-' in Locus or
    Product Name leaves the corresponding attribute (``common_name`` /
    ``description``) unset.

    filename       -- path of the file to read
    chromosome     -- chromosome assigned to every gene; if None it is
                      looked up from the title line using chromosome_map
    chromosome_map -- mapping from a substring of the title line to a
                      chromosome; only consulted when chromosome is None
    rna            -- if true, gene.type comes from
                      guess_rna_gene_type(gene.description), otherwise it
                      is 'CDS'

    Returns the list of genes.  Returns None, after printing the error, if
    the title or header line cannot be read or the chromosome cannot be
    determined.  If a data row fails to parse, the error is printed, reading
    stops, and the genes read so far are returned.
    """
    genes = []
    with open(filename, 'r') as f:
        try:
            # first two lines hold title and column headers:
            title = next(f)

            # figure out chromosome from title
            if chromosome is None:
                for key in (chromosome_map or {}):
                    if key in title:
                        chromosome = chromosome_map[key]
                        break
            if chromosome is None:
                raise Exception(
                    "Can't figure out chromosome for: %s\ntitle=%s" % (filename, title))

            # parse out column headers into a map from column name to index
            columns = {}
            for index, column in enumerate(next(f).strip().split("\t")):
                columns[column] = index
        except Exception as e:
            # best effort: report the problem and signal failure with None
            print("Error reading file: " + filename)
            print(str(type(e)) + ": " + str(e))
            return None

        # defined up front so the handler below can always report it
        line = ''
        try:
            # read lines into objects
            for line in f:
                # strip leading and trailing whitespace
                line = line.strip()

                # skip blank lines
                if not line:
                    continue

                fields = line.split("\t")
                gene = OpenStruct()
                gene.name = fields[columns['Locus_tag']]
                if fields[columns['Locus']] != '-':
                    gene.common_name = fields[columns['Locus']]
                if 'Gi' in columns:
                    gene.gi = int(fields[columns['Gi']])
                gene.geneid = int(fields[columns['GeneID']])
                gene.strand = fields[columns['Strand']]  # '+' or '-'
                gene.start = int(fields[columns['Start']])
                gene.end = int(fields[columns['End']])
                if fields[columns['Product Name']] != '-':
                    gene.description = fields[columns['Product Name']]
                gene.chromosome = chromosome
                if rna:
                    gene.type = guess_rna_gene_type(gene.description)
                else:
                    gene.type = 'CDS'
                genes.append(gene)
        except Exception as e:
            # best effort: report the offending line and keep what was read
            print("Error reading line: " + line)
            print(str(type(e)) + ": " + str(e))

    return genes
def read_go_terms(filename):
    """
    Read a file of GO (gene ontology) terms and return a list of term objects.
    Based on the ontology file OBO v1.2 downloaded from http://www.geneontology.org/.

    format-version: 1.2
    date: 27:10:2011 14:45
    saved-by: gwg
    auto-generated-by: OBO-Edit 2.1-rc2
    remark: cvs version: $Revision: 1.2357 $

    keys are in { id, alt_id, name, namespace, def, comment, created_by, creation_date,
                  is_obsolete, replaced_by, consider, synonym, is_a, subset, disjoint_from,
                  relationship, intersection_of, xref }

    Only [Term] stanzas are read; the file header and other stanzas are
    skipped.  Each "<tag>: <value>" line of a term is stored on the term
    object with set_or_append(tag, value).  For values that start with a
    quoted string only the text inside the quotes is kept; for all other
    values a trailing " ! <comment>" is removed.

    Raises ValueError if a line inside a [Term] stanza is not a tag-value pair.
    """
    # See format description: http://www.geneontology.org/GO.format.obo-1_2.shtml#S.1.1
    # Tag-Value Pairs:
    #   <tag>: <value> {<trailing modifiers>} ! <comment>
    # Trailing Modifiers
    #   {<name>=<value>, <name=value>, <name=value>}

    # we'll be making and returning a list of term objects
    terms = []

    # match quoted string values
    quoted_string_re = re.compile(r'"((?:[^"\\]|\\.)*)"(?:\s+(.*?))?')
    # match a stanza header such as [Term] or [Typedef]
    stanza_header_re = re.compile(r'\[(\w+)\]\s*$')

    with open(filename, 'r') as f:
        # the term currently being filled in, or None outside a [Term] stanza
        term = None
        for line in f:
            line = line.rstrip("\r\n")

            header = stanza_header_re.match(line)
            if header:
                # a header always closes the previous stanza, even when the
                # separating blank line is missing
                if header.group(1) == "Term":
                    term = OpenStruct()
                    terms.append(term)
                else:
                    term = None
                continue

            if term is None:
                # file header or a stanza we don't care about
                continue

            if not line.strip():
                # blank line ends stanza
                term = None
                continue

            # capture a key/value pair
            key, separator, rest = line.partition(': ')
            if not separator:
                raise ValueError("Malformed tag-value line in [Term] stanza: %r" % line)

            # deal with quoted strings first, so that a ' ! ' inside the
            # quotes is not mistaken for the start of a comment.
            # here, we're implicitly dropping suffixes from synonym and def
            # lines that look like these:
            #   EXACT [GOC:obol]
            #   EXACT [EC:4.1.1.18]
            #   BROAD [EC:1.1.5.4]
            #   NARROW [EC:2.7.8.7]
            #   [GOC:bf, GOC:signaling, PMID:15084302, PMID:17662591]
            m = quoted_string_re.match(rest)
            if m:
                value = m.group(1)
            else:
                # remove comments
                comment_index = rest.rfind(' ! ')
                if comment_index > -1:
                    value = rest[0:comment_index]
                else:
                    value = rest

            term.set_or_append(key, value)

    return terms