Esempio n. 1
0
File: IO.py Progetto: wkpalan/aigo
def extract_Affy(fileName,
                 G,
                 refSet=None,
                 GO_columns=(30, 31, 32),
                 filetype="Affy",
                 delimiter=',',
                 quoting=csv.QUOTE_ALL):
    """Parse an Affymetrix-style annotation CSV into gene <-> GO mappings.

    Leading lines starting with '#' are skipped, the next line is consumed
    as the header, and every remaining row is read with ``csv.reader``.
    Column 0 of a row is the gene product id.  Each cell addressed by
    ``GO_columns`` is split on '///'; for every entry that is neither
    "---" nor blank, the text before the first '//' is parsed as an integer
    and formatted as ``GO:%07d``.

    Parameters:
        fileName: path of the annotation file (passed through checkForZip).
        G: GO graph object; must expose ``aspect`` (iterable of aspect
            names) and ``get_GOAlternative(go, nameSpace=True)`` returning
            a ``(term, aspect)`` pair.
        refSet: optional collection of gene product ids; when non-empty,
            rows whose id is not in it are skipped with a warning.
        GO_columns: column indices holding GO annotations.  At most the
            first three are used (same as the original zip against three
            aspect names).
        filetype: accepted for interface compatibility; not used here.
        delimiter, quoting: forwarded to the csv dialect.

    Returns:
        (GenetoGO, GOtoGene): two dicts keyed by aspect.  GenetoGO[aspect]
        maps gene id -> set of GO terms, GOtoGene[aspect] maps GO term ->
        set of gene ids.

    Raises:
        IOError: if the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # NOTE(review): the handle returned by readFile is never closed here;
    # confirm whether readFile's return value should be closed by callers.
    csvfile = readFile(fileName)

    # Only membership is ever tested, so a set replaces the identity dict.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # Skip comment lines.  startswith() is used instead of row[0] because
    # readline() returns '' at end of file, and ''[0] raises IndexError.
    row = csvfile.readline()
    while row.startswith('#'):
        row = csvfile.readline()
    # The line just read is the header; it is consumed and not used further.

    # Registering the dialect is kept because it is a process-wide side
    # effect other code may rely on.
    csv.register_dialect('format', delimiter=delimiter, quoting=quoting)
    rd = csv.reader(csvfile, dialect='format')

    aspect_names = ('biological_process', 'cellular_component',
                    'molecular_function')
    for row in rd:
        # Gene product id
        g = row[0]

        if hasRef is not None and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        # The zip only bounds the number of columns visited; the aspect
        # actually used for storage comes from the GO graph below.
        for _column_aspect, i in zip(aspect_names, GO_columns):
            for item in row[i].split('///'):
                if item == "---" or not item.strip():
                    continue

                go = "GO:%07d" % int(item.split('//')[0].replace('/', ''))

                # Resolve to the term/aspect the graph reports for this id.
                go, aspect = G.get_GOAlternative(go, nameSpace=True)
                if not aspect:
                    logger.handleWarning(
                        "term %s is not in GO graph, skip it " % go)
                    continue
                GenetoGO[aspect].setdefault(g, set()).add(go)
                GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
Esempio n. 2
0
File: IO.py Progetto: wkpalan/aigo
def extract_GO2GP(fileName,
                  G,
                  refSet=None,
                  sep1='\t',
                  sep2=',',
                  comments='#',
                  skiprows=0):
    """Read a GO-term-to-gene-products mapping file.

    Each data line has the form ``GO_ID <sep1> Gene_ID<sep2>Gene_ID...``
    (``sep1`` is a tab and ``sep2`` a comma by default).  Only the first
    two columns are loaded.  Rows whose first column does not start with
    'GO:' are ignored.

    Parameters:
        fileName: path of the mapping file (passed through checkForZip).
        G: GO graph object; must expose ``aspect`` and
            ``get_GOAlternative(go, nameSpace=True)`` returning a
            ``(term, aspect)`` pair.
        refSet: optional collection of gene product ids; when non-empty,
            ids not in it are skipped with a warning.
        sep1: column delimiter handed to ``loadtxt``.
        sep2: separator between gene product ids inside the second column.
        comments, skiprows: forwarded to ``loadtxt``.

    Returns:
        (GenetoGO, GOtoGene): two dicts keyed by aspect.  GenetoGO[aspect]
        maps gene id -> set of GO terms, GOtoGene[aspect] maps GO term ->
        set of gene ids.  The term stored is the one returned by
        ``get_GOAlternative``, as in the other extract_* readers.

    Raises:
        IOError: if the file does not exist.
    """
    fileName = str(checkForZip(fileName))
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # Only membership is ever tested, so a set replaces the identity dict.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # ndmin=2 keeps a one-row file as shape (1, 2); without it loadtxt
    # squeezes the result to 1-D and the (go, GP) unpacking below breaks.
    data = loadtxt(fileName,
                   dtype="S",
                   usecols=[0, 1],
                   delimiter=sep1,
                   comments=comments,
                   skiprows=skiprows,
                   ndmin=2)

    for go, GP in data:

        if not go.startswith('GO:'):
            continue

        # Get the alternative term if any and its GO aspect
        term, aspect = G.get_GOAlternative(go, nameSpace=True)

        if not aspect:
            logger.handleWarning("term %s is not in GO graph, skip it " %
                                 (term))
            continue

        for gp in GP.split(sep2):
            gp = gp.strip()

            if hasRef is not None and gp not in hasRef:
                logger.handleWarning(
                    "gene product %s is not in the reference set, skip it " %
                    gp)
                continue

            # Store the resolved term (previously the raw id was stored,
            # which left the lookup result unused).
            GenetoGO[aspect].setdefault(gp, set()).add(term)
            GOtoGene[aspect].setdefault(term, set()).add(gp)

    return GenetoGO, GOtoGene
Esempio n. 3
0
File: IO.py Progetto: wkpalan/aigo
def extract_GAF(fileName, G, refSet=None):
    """Read a GAF annotation file into gene <-> GO mappings.

    Rows and column names come from ``readGAF_2``.  The gene product id is
    built as ``<taxon>.<DB Object Symbol>``, where ``<taxon>`` is the
    "Taxon(|taxon)" field with its first six characters removed
    (presumably the "taxon:" prefix -- confirm against readGAF_2).
    Annotations whose Qualifier contains 'NOT' are ignored, as are GO ids
    that do not start with 'GO:'.

    Parameters:
        fileName: path of the GAF file (passed through checkForZip).
        G: GO graph object; must expose ``aspect`` and
            ``get_GOAlternative(go, nameSpace=True)`` returning a
            ``(term, aspect)`` pair.
        refSet: optional collection of gene product ids; when non-empty,
            gene products not in it are skipped with a warning.

    Returns:
        (GenetoGO, GOtoGene): two dicts keyed by aspect.  GenetoGO[aspect]
        maps gene id -> set of GO terms, GOtoGene[aspect] maps GO term ->
        set of gene ids.

    Raises:
        IOError: if the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # The original assigned this lookup to a misspelled name (refRef), so
    # hasRef stayed None and the reference-set filter never ran.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    data, GAF_col = readGAF_2(fileName)

    for row in data:
        taxon = row[GAF_col.index("Taxon(|taxon)")][6:]
        symbol = row[GAF_col.index("DB Object Symbol")]
        g = ".".join([taxon, symbol])

        go = row[GAF_col.index('GO ID')]

        if 'NOT' in row[GAF_col.index('Qualifier')]:
            logger.handleWarning(
                "go term %s for gene product %s is qualified as NOT: ignored" %
                (go, g))
            continue

        if hasRef is not None and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if not go.startswith('GO:'):
            continue

        # Resolve to the term/aspect the graph reports for this id.
        go, aspect = G.get_GOAlternative(go, nameSpace=True)

        if not aspect:
            logger.handleWarning("term %s is not in GO graph, skip it " %
                                 go)
            continue

        GenetoGO[aspect].setdefault(g, set()).add(go)
        GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
Esempio n. 4
0
File: IO.py Progetto: wkpalan/aigo
def extract_SCOP(fileName, G, refSet=None):
    """Read a ';'-delimited SCOP-domain-to-GO file into gene <-> GO mappings.

    The first line is the header; it must contain the columns 'domScop'
    (used as the gene product id) and 'termGo' (the GO id).  Rows whose
    GO id does not start with 'GO:' are ignored.

    Parameters:
        fileName: path of the file (passed through checkForZip).
        G: GO graph object; must expose ``aspect`` and
            ``get_GOAlternative(go, nameSpace=True)`` returning a
            ``(term, aspect)`` pair.
        refSet: optional collection of ids; when non-empty, ids not in it
            are skipped with a warning.

    Returns:
        (GenetoGO, GOtoGene): two dicts keyed by aspect.  GenetoGO[aspect]
        maps id -> set of GO terms, GOtoGene[aspect] maps GO term -> set
        of ids.

    Raises:
        IOError: if the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # Only membership is ever tested, so a set replaces the identity dict.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    rd = csv.reader(readFile(fileName), delimiter=";")
    header = next(rd)

    for row in rd:
        g = row[header.index('domScop')]
        go = row[header.index('termGo')]

        if hasRef is not None and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if not go.startswith('GO:'):
            continue

        # Get the alternative term if any and its GO aspect
        go, aspect = G.get_GOAlternative(go, nameSpace=True)

        if not aspect:
            # The original formatted an undefined name here, so the
            # warning itself raised NameError.
            logger.handleWarning("term %s is not in GO graph, skip it " %
                                 go)
            continue

        GenetoGO[aspect].setdefault(g, set()).add(go)
        GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
Esempio n. 5
0
File: OBO.py Progetto: wkpalan/aigo
def readGOoboXML(fileName, force=False, prefix="GO"):
    """Load a GO graph from an OBO file, using a pickle cache next to it.

    The cache lives at ``"<fileName>.pic"``.  If it exists and ``force`` is
    false, the graph is unpickled from it.  If the cache is missing,
    unreadable, empty or corrupt, or ``force`` is true, the graph is
    rebuilt with ``get_GOGraph`` and the cache is rewritten.

    Parameters:
        fileName: path of the OBO file (passed through checkForZip when the
            graph has to be rebuilt).
        force: when true, ignore any existing cache and rebuild.
        prefix: forwarded to ``get_GOGraph``.

    Returns:
        The GO graph object, or None if no graph could be obtained and
        ``logger.handleFatal`` returned instead of aborting.

    Any exception raised while rebuilding or saving is reported through
    ``logger.handleFatal``.
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    picName = "%s.pic" % fileName
    G = None

    if not os.path.exists(picName):
        force = True

    if not force:
        try:
            logger.info("Reading serialized OBO file : %s" % picName)
            # NOTE(security): unpickling runs code embedded in the file;
            # only use cache files from a trusted location.
            with open(picName, "rb") as f:
                G = pickle.load(f)
        except (IOError, EOFError, pickle.UnpicklingError) as inst:
            # Unreadable, empty or corrupt cache: rebuild from the source.
            print(str(type(inst)) + " for " + picName)
            force = True

    try:
        if force:
            fileName = checkForZip(fileName)
            if not os.path.exists(fileName):
                raise IOError(fileName + " does not exist and is required ")

            logger.info("Reading OBO file : %s" % fileName)

            G = get_GOGraph(readFile(fileName, mode="r"), prefix=prefix)
            G.fileName = fileName

            # The with-block closes the file; no explicit close() needed.
            with open(picName, "wb") as f:
                logger.info("Saving serialized OBO file")
                pickle.dump(G, f, -1)
    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))

    # The original built G and then discarded it by falling off the end.
    return G
Esempio n. 6
0
    def add(self, fileName, refType="Fasta"):
        """Add the gene product ids found in ``fileName`` to this set.

        ``self.fileName`` / ``self.refType`` record every file added: the
        first call stores plain values, later calls turn them into lists.

        Parameters:
            fileName: path of the file to read (passed through checkForZip).
            refType: how to extract ids from the file:
                "Fasta" - record names parsed by Bio.SeqIO; the id is the
                          text before the first ';', after the last ':'.
                "Text"  - first column of a ';'-delimited file (blank
                          lines are skipped).
                "GAF"   - "<taxon>.<DB Object Symbol>" for every row
                          returned by readGAF_2.
                "AFFY"  - column 0 of every row whose column 4 is not
                          "Control sequence" (case-insensitive), after
                          skipping '#' comment lines and the header line.

        A missing file, an unknown ``refType`` or any read error is
        reported through ``logger.handleFatal``.  A warning is logged when
        the set is empty after loading.
        """
        if self.fileName == '':
            self.fileName = fileName
            self.refType = refType
        elif isinstance(self.fileName, list):
            self.fileName.append(fileName)
            self.refType.append(refType)
        else:
            self.fileName = [self.fileName, fileName]
            self.refType = [self.refType, refType]

        fileName = checkForZip(fileName)
        if not os.path.exists(fileName):
            logger.handleFatal(fileName + " does not exist and is required ")

        logger.info("Organism :\t%s" % self.organism)

        logger.info("%s file :\t%s " % (refType, fileName))

        try:

            # Use fasta file to define the reference set
            if refType == "Fasta":
                from Bio import SeqIO
                allID = set(
                    rec.name.split(";")[0].split(":")[-1]
                    for rec in SeqIO.parse(readFile(fileName), "fasta"))
                self.update(allID)

            # Use a simple text file to define the reference set, first
            # column is chosen by default
            elif refType == "Text":
                # csv.reader yields [] for blank lines; skip them instead
                # of failing on r[0].
                allID = set(
                    r[0]
                    for r in csv.reader(readFile(fileName), delimiter=";")
                    if r)
                self.update(allID)

            # Use a GO annotation file to define the reference set
            elif refType == "GAF":
                from AIGO.IO import readGAF_2
                data, GAF_col = readGAF_2(fileName)

                allID = set(
                    ".".join([
                        row[GAF_col.index("Taxon(|taxon)")][6:],
                        row[GAF_col.index("DB Object Symbol")]
                    ]) for row in data)
                self.update(allID)

            # Use a Affymetrix annotation file to define the reference set
            elif refType == "AFFY":
                f = readFile(fileName)
                # startswith() instead of row[0]: readline() returns '' at
                # end of file and ''[0] raises IndexError.
                row = f.readline()
                while row.startswith('#'):
                    row = f.readline()
                # The line just read is the header; it is not used further.

                allID = set()
                for row in csv.reader(f):
                    # Read gene product id if not control sequence
                    if "Control sequence".upper() != row[4].upper():
                        allID.add(row[0])

                self.update(allID)
            else:
                print("Sorry, unknown file type !!")
                # Carry a message so the fatal log below is not empty.
                raise ValueError("unknown reference type %r" % (refType,))

            if len(self) == 0:
                logger.handleWarning("No gene products loaded")

        except Exception as e:
            logger.handleFatal("Unable to read file %s: %s" %
                               (fileName, str(e)))