def read_genes(filename, chromosome=None, chromosome_map=None, rna=False):
    """Parse a tab-delimited gene table into a list of gene records.

    The first line of the file is treated as a title and the second line as
    tab-separated column headers; every following non-blank line is one gene.
    The columns 'Locus_tag', 'Locus', 'GeneID', 'Strand', 'Start', 'End' and
    'Product Name' are looked up by header name; 'Gi' is read only if present.

    Args:
        filename: path of the file to read.
        chromosome: chromosome label stored on every gene. When None, it is
            derived from the title line using ``chromosome_map``.
        chromosome_map: mapping from a substring of the title line to a
            chromosome label; the first key found in the title wins. Only
            consulted when ``chromosome`` is None.
        rna: when true, each gene's type comes from
            ``guess_rna_gene_type(gene.description)``; otherwise it is 'CDS'.

    Returns:
        A list of ``OpenStruct`` records with ``name``, ``geneid``,
        ``strand``, ``start``, ``end``, ``chromosome`` and ``type`` set, plus
        ``common_name``, ``gi`` and ``description`` when the file provides
        them. Returns None if the title/header lines cannot be processed.
        If a data row fails to parse, the error is printed and the genes
        collected up to that point are returned.
    """
    genes = []
    with open(filename, 'r') as f:
        try:
            # first two lines hold title and column headers
            title = next(f)

            # figure out chromosome from title
            if chromosome is None:
                # A missing map is treated as empty so the caller gets the
                # descriptive error below rather than a TypeError.
                for key in (chromosome_map or {}):
                    if key in title:
                        chromosome = chromosome_map[key]
                        break
            if chromosome is None:
                raise Exception(
                    "Can't figure out chromosome for: %s\ntitle=%s"
                    % (filename, title))

            # map each column header to its position in a row
            columns = {
                name: index
                for index, name in enumerate(next(f).strip().split("\t"))
            }
        except Exception as e:
            print("Error reading file: " + filename)
            print(str(type(e)) + ": " + str(e))
            return None

        # Pre-bind so the handler below can always report something, even
        # when the failure happens before the first line has been read.
        line = ''
        try:
            for line in f:
                line = line.strip()
                # skip blank lines
                if not line:
                    continue
                fields = line.split("\t")

                gene = OpenStruct()
                gene.name = fields[columns['Locus_tag']]
                # '-' is the file's placeholder for "no value"
                locus = fields[columns['Locus']]
                if locus != '-':
                    gene.common_name = locus
                if 'Gi' in columns:
                    gene.gi = int(fields[columns['Gi']])
                gene.geneid = int(fields[columns['GeneID']])
                gene.strand = fields[columns['Strand']]  # '+' or '-'
                gene.start = int(fields[columns['Start']])
                gene.end = int(fields[columns['End']])
                product = fields[columns['Product Name']]
                if product != '-':
                    gene.description = product
                gene.chromosome = chromosome
                if rna:
                    # NOTE(review): description is never assigned when the
                    # product is '-'; this relies on how OpenStruct handles
                    # unset attributes -- confirm.
                    gene.type = guess_rna_gene_type(gene.description)
                else:
                    gene.type = 'CDS'
                genes.append(gene)
        except Exception as e:
            # Best effort: report the offending row and return what was
            # parsed so far.
            print("Error reading line: " + line)
            print(str(type(e)) + ": " + str(e))
    return genes