def read_tigr_roles(filename):
    """
    Reads a truly loony file format that stores the tigrfam mainrole/subrole hierarchy.

    Each non-blank line is tab-separated. Field 0 is ignored, field 1 is the
    integer tigr role id, field 2 is the role type ('mainrole' or 'sub1role',
    with any trailing colons removed) and field 3 is the role name. A main
    role and a subrole are tied together by sharing the same tigr role id.

    Returns a tuple (mainroles, roles_by_id):
      mainroles   -- list of main role objects, one per distinct main role
                     name in order of first appearance; each has a
                     'children' list holding its subroles
      roles_by_id -- dictionary from tigr role id to the subrole object,
                     which can be used with the file TIGRFAMS_ROLE_LINK to
                     attach TIGRFams to roles

    Raises ValueError if a line has an unknown role type and KeyError if a
    subrole's id has no matching mainrole line.
    """
    mainroles = []
    mainroles_by_name = {}
    mainroles_by_id = {}
    roles_by_id = {}
    with open(filename, 'r') as f:
        for line in f:
            # tolerate blank lines, e.g. a stray newline at the end of the file
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            tigr_role_id = int(fields[1])
            role_type = fields[2].rstrip(':')
            # validate before building the role object; raising a bare string
            # is not a valid exception, so use ValueError
            if role_type not in ('mainrole', 'sub1role'):
                raise ValueError("Unknown role type: " + role_type)
            role = OpenStruct()
            role.tigr_role_id = tigr_role_id
            role.type = role_type
            role.name = fields[3]
            if role_type == 'mainrole':
                # the same main role name is repeated under many ids; keep
                # only the first object and map every id to it
                if role.name not in mainroles_by_name:
                    mainroles.append(role)
                    mainroles_by_name[role.name] = role
                    role.children = []
                mainroles_by_id[tigr_role_id] = mainroles_by_name[role.name]
            else:
                roles_by_id[tigr_role_id] = role
    # add subroles to main roles
    for role_id, subrole in roles_by_id.items():
        mainroles_by_id[role_id].children.append(subrole)
    return (mainroles, roles_by_id,)
def read_microbes_online_genome_info(filename):
    """
    Read a genomeInfo.txt file from microbes online.

    The first line is a tab-separated header of column names. These files
    have these columns: locusId, accession, GI, scaffoldId, start, stop,
    strand, sysName, name, desc, COG, COGFun, COGDesc, TIGRFam, TIGRRoles,
    GO, EC, ECDesc. Every following non-empty line becomes one gene object
    whose items are keyed by column name; values are kept as strings.

    Return a list of gene objects (empty if the file is empty or holds only
    the header). Raises IndexError if a row has fewer fields than the header.
    """
    # we'll be making and returning a list of gene objects
    genes = []

    with open(filename, 'r') as f:

        # read header and create map from column name to index.
        # The next() builtin works on Python 2.6+ and 3 (file.next() is
        # Python 2 only); the default avoids StopIteration on an empty file.
        header = next(f, None)
        if header is None:
            return genes
        column_names = header.rstrip("\n").split("\t")
        column = {name: index for index, name in enumerate(column_names)}

        for line in f:
            line = line.rstrip("\n")

            # skip empty lines, e.g. a stray newline at the end of the file
            if line == "":
                continue
            fields = line.split("\t")

            # create an object for each row
            gene = OpenStruct()
            for column_name in column_names:
                gene[column_name] = fields[column[column_name]]
            genes.append(gene)

    return genes
def read_cogs(filename):
    """
    Read COG functions.

    Lines that start with "[LETTERS] COGnnnn name" each yield one object
    with 'id' (the COG identifier), 'name' (the rest of the line),
    'parents' (the bracketed letters as a single string) and 'namespace'
    set to 'cog'. Lines that do not match are ignored.

    Returns the list of COG objects in file order.
    """
    pattern = re.compile(r'\[(\w+)\]\s+(COG\d+)\s+(.*)')
    result = []
    with open(filename, 'r') as handle:
        for text in handle:
            found = pattern.match(text)
            if found is None:
                # not a COG line
                continue
            letters, cog_id, cog_name = found.groups()
            entry = OpenStruct()
            entry.id = cog_id
            entry.name = cog_name
            entry.parents = letters
            entry.namespace = 'cog'
            result.append(entry)
    return result
def read_tigrfams_by_role(filename):
    """
    This is no longer used!
    Parses the TIGRFams-by-role listing, in which TIGRFams are grouped under
    categories (roles) and sub-roles by indentation:
      no indent         -- a new category, stored as {'name':..., 'roles':[...]}
      3+ leading spaces -- a new sub-role of the current category, stored as
                           {'name':..., 'tigrfams':[...]}
      6+ leading spaces -- a tab-separated TIGRFam row (id, name, description)
                           added to the current sub-role; rows whose first
                           field is 'Accession' are column headers and skipped
    Blank lines are ignored.
    Returns the list of category dictionaries in file order.
    Downloaded file from here: http://cmr.jcvi.org/tigr-scripts/CMR/shared/EvidenceList.cgi?ev_type=TIGRFAM&order_type=role
    Note the TIGRFams flat file is more complete than the TIGRFams by role file.
    """
    roles_tree = []
    current_role = None
    current_subrole = None

    with open(filename, 'r') as handle:
        for raw in handle:
            if not raw.strip():
                # nothing on this line
                continue

            if raw.startswith("      "):
                # deepest level: one tigrfam row (or the column header row)
                fields = raw.lstrip(' ').rstrip("\n").split("\t")
                if fields[0] != 'Accession':
                    entry = OpenStruct()
                    entry.id = fields[0]
                    entry.name = fields[1]
                    entry.description = fields[2]
                    current_subrole['tigrfams'].append(entry)
                continue

            label = raw.strip()
            if raw.startswith("   "):
                # middle level: open a sub-role under the current category
                current_subrole = {'name': label, 'tigrfams': []}
                current_role['roles'].append(current_subrole)
            else:
                # top level: open a new category
                current_role = {'name': label, 'roles': []}
                roles_tree.append(current_role)

    return roles_tree
def read_tigrfams_by_role(filename):
    """
    This is no longer used!
    Parses the TIGRFams-by-role listing, in which TIGRFams are grouped under
    categories (roles) and sub-roles by indentation:
      no indent         -- a new category, stored as {'name':..., 'roles':[...]}
      3+ leading spaces -- a new sub-role of the current category, stored as
                           {'name':..., 'tigrfams':[...]}
      6+ leading spaces -- a tab-separated TIGRFam row (id, name, description)
                           added to the current sub-role; rows whose first
                           field is 'Accession' are column headers and skipped
    Blank lines are ignored.
    Returns the list of category dictionaries in file order.
    Downloaded file from here: http://cmr.jcvi.org/tigr-scripts/CMR/shared/EvidenceList.cgi?ev_type=TIGRFAM&order_type=role
    Note the TIGRFams flat file is more complete than the TIGRFams by role file.
    """
    roles_tree = []
    current_role = None
    current_subrole = None

    with open(filename, 'r') as handle:
        for raw in handle:
            if not raw.strip():
                # nothing on this line
                continue

            if raw.startswith("      "):
                # deepest level: one tigrfam row (or the column header row)
                fields = raw.lstrip(' ').rstrip("\n").split("\t")
                if fields[0] != 'Accession':
                    entry = OpenStruct()
                    entry.id = fields[0]
                    entry.name = fields[1]
                    entry.description = fields[2]
                    current_subrole['tigrfams'].append(entry)
                continue

            label = raw.strip()
            if raw.startswith("   "):
                # middle level: open a sub-role under the current category
                current_subrole = {'name': label, 'tigrfams': []}
                current_role['roles'].append(current_subrole)
            else:
                # top level: open a new category
                current_role = {'name': label, 'roles': []}
                roles_tree.append(current_role)

    return roles_tree
def read_tigrfams(filename):
    """
    Read the flat listing of TIGRFams.

    The first line of the file is a header and is skipped. Every following
    non-blank line is tab-separated and yields one object with 'id',
    'name' and 'description' taken from the first three fields.

    Returns the list of TIGRFam objects in file order (empty if the file is
    empty or holds only the header).

    Note the TIGRFams flat file is more complete than the TIGRFams by role file.
    The flat file is a superset of the by-role file.
    """
    tigrfams = []
    with open(filename, 'r') as f:

        # skip header. The next() builtin works on Python 2.6+ and 3
        # (file.next() is Python 2 only); the default keeps an empty file
        # from raising StopIteration.
        next(f, None)

        for line in f:
            # tolerate blank lines, e.g. a stray newline at the end of the file
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            tigrfam = OpenStruct()
            tigrfam.id = fields[0]
            tigrfam.name = fields[1]
            tigrfam.description = fields[2]
            tigrfams.append(tigrfam)

    return tigrfams
def read_tigrfams(filename):
    """
    Read the flat listing of TIGRFams.

    The first line of the file is a header and is skipped. Every following
    non-blank line is tab-separated and yields one object with 'id',
    'name' and 'description' taken from the first three fields.

    Returns the list of TIGRFam objects in file order (empty if the file is
    empty or holds only the header).

    Note the TIGRFams flat file is more complete than the TIGRFams by role file.
    The flat file is a superset of the by-role file.
    """
    tigrfams = []
    with open(filename, 'r') as f:

        # skip header. The next() builtin works on Python 2.6+ and 3
        # (file.next() is Python 2 only); the default keeps an empty file
        # from raising StopIteration.
        next(f, None)

        for line in f:
            # tolerate blank lines, e.g. a stray newline at the end of the file
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            tigrfam = OpenStruct()
            tigrfam.id = fields[0]
            tigrfam.name = fields[1]
            tigrfam.description = fields[2]
            tigrfams.append(tigrfam)

    return tigrfams
def read_tigr_roles(filename):
    """
    Reads a truly loony file format that stores the tigrfam mainrole/subrole hierarchy.

    Each non-blank line is tab-separated. Field 0 is ignored, field 1 is the
    integer tigr role id, field 2 is the role type ('mainrole' or 'sub1role',
    with any trailing colons removed) and field 3 is the role name. A main
    role and a subrole are tied together by sharing the same tigr role id.

    Returns a tuple (mainroles, roles_by_id):
      mainroles   -- list of main role objects, one per distinct main role
                     name in order of first appearance; each has a
                     'children' list holding its subroles
      roles_by_id -- dictionary from tigr role id to the subrole object,
                     which can be used with the file TIGRFAMS_ROLE_LINK to
                     attach TIGRFams to roles

    Raises ValueError if a line has an unknown role type and KeyError if a
    subrole's id has no matching mainrole line.
    """
    mainroles = []
    mainroles_by_name = {}
    mainroles_by_id = {}
    roles_by_id = {}
    with open(filename, 'r') as f:
        for line in f:
            # tolerate blank lines, e.g. a stray newline at the end of the file
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            tigr_role_id = int(fields[1])
            role_type = fields[2].rstrip(':')
            # validate before building the role object; raising a bare string
            # is not a valid exception, so use ValueError
            if role_type not in ('mainrole', 'sub1role'):
                raise ValueError("Unknown role type: " + role_type)
            role = OpenStruct()
            role.tigr_role_id = tigr_role_id
            role.type = role_type
            role.name = fields[3]
            if role_type == 'mainrole':
                # the same main role name is repeated under many ids; keep
                # only the first object and map every id to it
                if role.name not in mainroles_by_name:
                    mainroles.append(role)
                    mainroles_by_name[role.name] = role
                    role.children = []
                mainroles_by_id[tigr_role_id] = mainroles_by_name[role.name]
            else:
                roles_by_id[tigr_role_id] = role
    # add subroles to main roles
    for role_id, subrole in roles_by_id.items():
        mainroles_by_id[role_id].children.append(subrole)
    return (mainroles, roles_by_id,)
def read_cog_categories(filename):
    """
    Read COG functional categories (see http://www.ncbi.nlm.nih.gov/COG/grace/fiew.cgi)

    Every line of the file yields one object. A line that starts with a
    single capital letter followed by a tab is a subcategory: its 'id' is
    the first tab-separated field, its 'name' the fourth, its 'parents' a
    one-element tuple holding the name of the most recent category line
    (None if there was none yet), and its 'namespace' is "cog subcategory".
    Any other line is a category: its 'name' is the whole line and its
    'namespace' is "cog category".

    Returns the list of objects in file order.
    """
    subcategory_re = re.compile("[A-Z]\t.*")
    entries = []
    current_category = None
    with open(filename, 'r') as handle:
        for text in handle:
            entry = OpenStruct()
            is_subcategory = subcategory_re.match(text) is not None
            if not is_subcategory:
                # a top-level category; remember it for following subcategories
                entry.name = text.rstrip("\n")
                entry.namespace = "cog category"
                current_category = entry.name
            else:
                parts = text.rstrip("\n").split("\t")
                entry.id = parts[0]
                entry.name = parts[3]
                entry.parents = (current_category,)
                entry.namespace = "cog subcategory"
            entries.append(entry)
    return entries
def read_cogs(filename):
    """
    Read COG functions.

    Lines that start with "[LETTERS] COGnnnn name" each yield one object
    with 'id' (the COG identifier), 'name' (the rest of the line),
    'parents' (the bracketed letters as a single string) and 'namespace'
    set to 'cog'. Lines that do not match are ignored.

    Returns the list of COG objects in file order.
    """
    pattern = re.compile(r'\[(\w+)\]\s+(COG\d+)\s+(.*)')
    result = []
    with open(filename, 'r') as handle:
        for text in handle:
            found = pattern.match(text)
            if found is None:
                # not a COG line
                continue
            letters, cog_id, cog_name = found.groups()
            entry = OpenStruct()
            entry.id = cog_id
            entry.name = cog_name
            entry.parents = letters
            entry.namespace = 'cog'
            result.append(entry)
    return result
def read_cog_categories(filename):
    """
    Read COG functional categories (see http://www.ncbi.nlm.nih.gov/COG/grace/fiew.cgi)

    Every line of the file yields one object. A line that starts with a
    single capital letter followed by a tab is a subcategory: its 'id' is
    the first tab-separated field, its 'name' the fourth, its 'parents' a
    one-element tuple holding the name of the most recent category line
    (None if there was none yet), and its 'namespace' is "cog subcategory".
    Any other line is a category: its 'name' is the whole line and its
    'namespace' is "cog category".

    Returns the list of objects in file order.
    """
    subcategory_re = re.compile("[A-Z]\t.*")
    entries = []
    current_category = None
    with open(filename, 'r') as handle:
        for text in handle:
            entry = OpenStruct()
            is_subcategory = subcategory_re.match(text) is not None
            if not is_subcategory:
                # a top-level category; remember it for following subcategories
                entry.name = text.rstrip("\n")
                entry.namespace = "cog category"
                current_category = entry.name
            else:
                parts = text.rstrip("\n").split("\t")
                entry.id = parts[0]
                entry.name = parts[3]
                entry.parents = (current_category,)
                entry.namespace = "cog subcategory"
            entries.append(entry)
    return entries
Exemple #12
0
def read_genes(filename, chromosome=None, chromosome_map=None, rna=False):
    """
    Read a tab-delimited gene table and return a list of gene objects.

    The first line of the file is a title and the second line holds the
    tab-separated column headers. The columns read from each data row are
    Locus_tag, Locus, GeneID, Strand, Start, End, Product Name and, if
    present, Gi.

    Arguments:
      filename       -- path of the file to read
      chromosome     -- value stored on every gene as gene.chromosome; if
                        None it is looked up from the title line using
                        chromosome_map
      chromosome_map -- optional dictionary; the first key found as a
                        substring of the title line selects the chromosome
      rna            -- if true, gene.type comes from
                        guess_rna_gene_type(gene.description); otherwise
                        gene.type is 'CDS'

    Returns the list of genes, or None if the title or header line cannot be
    read or the chromosome cannot be determined (the error is printed). If a
    data row cannot be parsed, the error is printed, reading stops, and the
    genes read so far are returned.
    """
    genes = []
    with open(filename, 'r') as f:
        try:
            # first two lines hold title and column headers.
            # next() works on Python 2.6+ and 3 (file.next() is Python 2 only)
            title = next(f)

            # figure out chromosome from title; a missing map simply means
            # there is nothing to look up
            if chromosome is None and chromosome_map:
                for key in chromosome_map:
                    if key in title:
                        chromosome = chromosome_map[key]
                        break

            if chromosome is None:
                raise Exception(
                    "Can't figure out chromosome for: %s\ntitle=%s" %
                    (filename, title))

            # parse out column headers into a map from name to index
            columns = {}
            for i, column in enumerate(next(f).strip().split("\t")):
                columns[column] = i
        except Exception as e:
            # single-argument print(...) behaves the same on Python 2 and 3
            print("Error reading file: " + filename)
            print(str(type(e)) + ": " + str(e))
            return None

        # defined up front so the error handler below can always report it
        line = ""
        try:
            # read line into objects
            for line in f:
                # strip leading and trailing whitespace
                line = line.strip()

                # skip blank lines
                if not line:
                    continue

                fields = line.split("\t")

                gene = OpenStruct()
                gene.name = fields[columns['Locus_tag']]  # locus tag
                if fields[columns['Locus']] != '-':
                    gene.common_name = fields[columns['Locus']]  # locus
                if 'Gi' in columns:
                    gene.gi = int(fields[columns['Gi']])
                gene.geneid = int(fields[columns['GeneID']])
                gene.strand = fields[columns['Strand']]  # '+' or '-'
                gene.start = int(fields[columns['Start']])
                gene.end = int(fields[columns['End']])
                if fields[columns['Product Name']] != '-':
                    gene.description = fields[columns['Product Name']]
                gene.chromosome = chromosome

                if rna:
                    # NOTE(review): description is not assigned when the
                    # product name is '-'; what gene.description yields then
                    # depends on OpenStruct -- confirm
                    gene.type = guess_rna_gene_type(gene.description)
                else:
                    gene.type = 'CDS'

                genes.append(gene)
        except Exception as e:
            print("Error reading line: " + line)
            print(str(type(e)) + ": " + str(e))
    return genes
def read_go_terms(filename):
    """
    Parse a GO (gene ontology) OBO file and return a list of term objects,
    one per [Term] stanza, in file order.
    Based on the ontology file OBO v1.2 downloaded from http://www.geneontology.org/.
    format-version: 1.2
    date: 27:10:2011 14:45
    saved-by: gwg
    auto-generated-by: OBO-Edit 2.1-rc2
    remark: cvs version: $Revision: 1.2357 $
    keys are in { id, alt_id,
                  name, namespace,
                  def, comment,
                  created_by, creation_date,
                  is_obsolete, replaced_by, consider,
                  synonym, is_a, subset, disjoint_from, relationship, intersection_of,
                  xref }

    Everything outside [Term] stanzas is ignored. Inside a stanza each line
    is split into key and value at the first ': ' and passed to the term's
    set_or_append(key, value); a blank line closes the stanza.
    """

    # See format description: http://www.geneontology.org/GO.format.obo-1_2.shtml#S.1.1
    # Tag-Value Pairs:
    #   <tag>: <value> {<trailing modifiers>} ! <comment>
    # Trailing Modifiers
    # {<name>=<value>, <name=value>, <name=value>}

    collected = []

    with open(filename, 'r') as handle:

        inside_term = False
        # a leading double-quoted string, allowing backslash escapes
        quoted_string_re = re.compile(r'"((?:[^"\\]|\\.)*)"(?:\s+(.*?))?')

        for raw in handle:
            text = raw.rstrip("\n")

            if not inside_term:
                # only a [Term] header is interesting outside a stanza
                if text == "[Term]":
                    inside_term = True
                    term = OpenStruct()
                    collected.append(term)
                continue

            if text == "":
                # blank line ends stanza
                inside_term = False
                continue

            # capture a key/value pair
            key, rest = text.split(': ', 1)

            # cut off a trailing ' ! comment', if there is one
            cut = rest.rfind(' ! ')
            value = rest[0:cut] if cut > -1 else rest

            # For quoted values keep only the text between the quotes.
            # This implicitly drops suffixes from synonym and def lines
            # that look like these:
            # EXACT [GOC:obol]
            # EXACT [EC:4.1.1.18]
            # BROAD [EC:1.1.5.4]
            # NARROW [EC:2.7.8.7]
            # [GOC:bf, GOC:signaling, PMID:15084302, PMID:17662591]
            quoted = quoted_string_re.match(value)
            if quoted:
                value = quoted.group(1)

            term.set_or_append(key, value)

    return collected