def read_tigr_roles(filename):
    """
    Read the TIGRFam mainrole/subrole hierarchy from a tab-separated file.

    Each non-blank line is split on tabs and is expected to carry:
      fields[1] -- the integer tigr role id
      fields[2] -- the role type, 'mainrole' or 'sub1role', optionally
                   followed by a trailing ':'
      fields[3] -- the role name
    fields[0] is ignored. A role id is expected to appear once on a
    mainrole line and once on a sub1role line; the subrole is attached to
    the children of the mainrole registered under the same id.

    Returns a tuple (mainroles, roles_by_id):
      mainroles   -- list of mainrole objects (one per distinct mainrole
                     name, in file order), each with a `children` list of
                     its subroles
      roles_by_id -- dictionary from tigr role id to subrole object, which
                     can be used with the file TIGRFAMS_ROLE_LINK to attach
                     TIGRFams to roles

    Raises ValueError if a line has an unknown role type, and KeyError if
    a subrole's id was never seen on a mainrole line.
    """
    mainroles = []
    mainroles_by_name = {}
    mainroles_by_id = {}
    roles_by_id = {}
    known_types = ('mainrole', 'sub1role')

    with open(filename, 'r') as f:
        for line in f:
            # skip blank lines (e.g. a trailing empty line at end of file)
            if not line.strip():
                continue

            fields = line.rstrip("\r\n").split("\t")
            role_id = int(fields[1])
            role_type = fields[2].rstrip(':')
            role_name = fields[3]

            # validate before building the role object
            if role_type not in known_types:
                raise ValueError("Unknown role type: " + role_type)

            role = OpenStruct()
            role.tigr_role_id = role_id
            role.type = role_type
            role.name = role_name

            if role_type == 'mainrole':
                # the same mainrole name is repeated for every subrole id;
                # keep only the first object for each distinct name
                if role_name not in mainroles_by_name:
                    mainroles.append(role)
                    mainroles_by_name[role_name] = role
                    role.children = []
                mainroles_by_id[role_id] = mainroles_by_name[role_name]
            else:
                roles_by_id[role_id] = role

    # add subroles to main roles; done in a second pass so that the order
    # of mainrole and sub1role lines in the file does not matter
    for role_id, subrole in roles_by_id.items():
        mainroles_by_id[role_id].children.append(subrole)

    return (mainroles, roles_by_id,)
def read_genes(filename, chromosome=None, chromosome_map=None, rna=False):
    """
    Read genes from a tab-separated file with a title line and a header line.

    The first line of the file is a title and the second line holds the
    tab-separated column names. Every following non-blank line is parsed
    into one gene object using the columns 'Locus_tag', 'Locus', 'GeneID',
    'Strand', 'Start', 'End', 'Product Name' and, if present, 'Gi'.

    Parameters:
      filename       -- path of the file to read
      chromosome     -- value stored on every gene as `gene.chromosome`;
                        if None it is looked up via chromosome_map
      chromosome_map -- dictionary whose keys are searched for as
                        substrings of the title line; the value of the
                        first matching key becomes the chromosome
      rna            -- if true, gene.type comes from
                        guess_rna_gene_type(gene.description); otherwise
                        gene.type is 'CDS'

    Returns the list of parsed genes, or None if the title/header lines
    could not be read or the chromosome could not be determined. Errors
    are printed rather than raised; if a data line fails to parse, reading
    stops there and the genes parsed so far are returned.
    """
    genes = []
    with open(filename, 'r') as f:
        try:
            # first two lines hold title and column headers:
            title = next(f)

            # figure out chromosome from title
            if chromosome is None:
                for key in (chromosome_map or {}):
                    if key in title:
                        chromosome = chromosome_map[key]
                        break
            if chromosome is None:
                raise Exception(
                    "Can't figure out chromosome for: %s\ntitle=%s"
                    % (filename, title.strip()))

            # parse out column headers: column name -> field index
            header = next(f).strip().split("\t")
            columns = dict((column, i) for i, column in enumerate(header))
        except Exception as e:
            print("Error reading file: " + filename)
            print(str(type(e)) + ": " + str(e))
            return None

        # keep `line` bound so the error handler below can always report it
        line = ''
        try:
            # read lines into objects
            for line in f:
                # strip leading and trailing whitespace
                line = line.strip()

                # skip blank lines
                if not line:
                    continue

                fields = line.split("\t")
                gene = OpenStruct()
                gene.name = fields[columns['Locus_tag']]  # locus tag
                # '-' marks a missing value in the file
                if fields[columns['Locus']] != '-':
                    gene.common_name = fields[columns['Locus']]  # locus
                if 'Gi' in columns:
                    gene.gi = int(fields[columns['Gi']])
                gene.geneid = int(fields[columns['GeneID']])
                gene.strand = fields[columns['Strand']]  # '+' or '-'
                gene.start = int(fields[columns['Start']])
                gene.end = int(fields[columns['End']])
                if fields[columns['Product Name']] != '-':
                    gene.description = fields[columns['Product Name']]  # product name
                gene.chromosome = chromosome
                if rna:
                    # NOTE(review): description is unset when Product Name
                    # is '-'; presumably OpenStruct yields None for unset
                    # attributes -- confirm against its definition
                    gene.type = guess_rna_gene_type(gene.description)
                else:
                    gene.type = 'CDS'
                genes.append(gene)
        except Exception as e:
            print("Error reading line: " + line)
            print(str(type(e)) + ": " + str(e))
    return genes