예제 #1
0
파일: IO.py 프로젝트: wkpalan/aigo
def extract_Affy(fileName,
                 G,
                 refSet=None,
                 GO_columns=[30, 31, 32],
                 filetype="Affy",
                 delimiter=',',
                 quoting=csv.QUOTE_ALL):
    """Build gene <-> GO term mappings from an Affymetrix annotation file.

    Leading lines starting with '#' are skipped, the first remaining line is
    consumed as the header, and every following row is parsed with the
    ``csv`` module.

    Parameters:
        fileName: path of the annotation file; passed through checkForZip().
        G: GO graph object. Must provide an ``aspect`` iterable and
            ``get_GOAlternative(term, nameSpace=True)`` returning a
            ``(term, aspect)`` pair.
        refSet: optional iterable of gene product ids. When non-empty, rows
            whose id (first column) is not in it are skipped with a warning.
        GO_columns: indexes of the columns holding GO annotations. The list
            is only read, never mutated; at most the first three are used.
        filetype: not used by this function.
        delimiter, quoting: forwarded to the csv dialect.

    Returns:
        ``(GenetoGO, GOtoGene)``: two dicts keyed by aspect. The first maps a
        gene product id to a set of GO ids, the second maps a GO id to a set
        of gene product ids.

    Raises:
        IOError: if the file does not exist.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # NOTE(review): the handle returned by readFile() is not closed here;
    # confirm who owns it.
    csvfile = readFile(fileName)

    # Only membership is needed, so a set is enough. None (or an empty
    # refSet) disables filtering.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    # Skip comments. startswith() is safe on '' (EOF), where indexing row[0]
    # would raise IndexError for a file made only of comments.
    row = csvfile.readline()
    while row.startswith('#'):
        row = csvfile.readline()
    # `row` now holds the header line, which is deliberately discarded.

    csv.register_dialect('format', delimiter=delimiter, quoting=quoting)
    for row in csv.reader(csvfile, dialect='format'):
        # The csv reader yields [] for blank lines.
        if not row:
            continue

        # Read gene product id
        g = row[0]

        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        # The nominal aspect of each column is ignored: the aspect reported
        # by the GO graph is the one used. zip() also caps the number of
        # processed columns at three.
        for _nominal_aspect, i in zip(
            ['biological_process', 'cellular_component', 'molecular_function'],
                GO_columns):
            for item in row[i].split('///'):
                # "---" marks an absent annotation; compare after stripping
                # so surrounding whitespace cannot reach int() below.
                if item.strip() in ("", "---"):
                    continue

                go = "GO:%07d" % int(item.split('//')[0].replace('/', ''))

                go, aspect = G.get_GOAlternative(go, nameSpace=True)
                if not aspect:
                    logger.handleWarning(
                        "term %s is not in GO graph, skip it " % go)
                    continue
                GenetoGO[aspect].setdefault(g, set()).add(go)
                GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
예제 #2
0
파일: IO.py 프로젝트: wkpalan/aigo
def extract_SCOP(fileName, G, refSet=None):
    """Build domain <-> GO term mappings from a ';'-delimited SCOP file.

    The first row is the header; the 'domScop' column gives the gene product
    (domain) id and the 'termGo' column gives the GO term. Only values that
    start with 'GO:' are looked up in the graph.

    Parameters:
        fileName: path of the annotation file; passed through checkForZip().
        G: GO graph object. Must provide an ``aspect`` iterable and
            ``get_GOAlternative(term, nameSpace=True)`` returning a
            ``(term, aspect)`` pair.
        refSet: optional iterable of ids. When non-empty, rows whose
            'domScop' value is not in it are skipped with a warning.

    Returns:
        ``(GenetoGO, GOtoGene)``: two dicts keyed by aspect. The first maps a
        domain id to a set of GO ids, the second maps a GO id to a set of
        domain ids. Both are empty (per aspect) for an empty file.

    Raises:
        IOError: if the file does not exist.
        ValueError: if the header lacks 'domScop' or 'termGo'.
    """
    fileName = checkForZip(fileName)
    if not os.path.exists(fileName):
        raise IOError(fileName + " does not exist and is required ")

    # Only membership is needed, so a set is enough. None (or an empty
    # refSet) disables filtering.
    hasRef = set(refSet) if refSet else None

    GenetoGO, GOtoGene = dict(), dict()
    for aspect in G.aspect:
        GenetoGO[aspect], GOtoGene[aspect] = dict(), dict()

    rd = csv.reader(readFile(fileName), delimiter=";")
    header = next(rd, None)
    if header is None:
        # Empty file: nothing to map.
        return GenetoGO, GOtoGene

    # Resolve the column positions once instead of once per row.
    domCol = header.index('domScop')
    goCol = header.index('termGo')

    for row in rd:
        # The csv reader yields [] for blank lines.
        if not row:
            continue

        g = row[domCol]
        term = row[goCol]

        if hasRef and g not in hasRef:
            logger.handleWarning(
                "gene product %s is not in the reference set, skip it " % g)
            continue

        if term.startswith('GO:'):
            # Get the alternative term if any and its GO aspect
            go, aspect = G.get_GOAlternative(term, nameSpace=True)

            if not aspect:
                # Report the id as read from the file. The original code
                # referenced an undefined name here and raised NameError.
                logger.handleWarning("term %s is not in GO graph, skip it " %
                                     term)
                continue

            GenetoGO[aspect].setdefault(g, set()).add(go)
            GOtoGene[aspect].setdefault(go, set()).add(g)

    return GenetoGO, GOtoGene
예제 #3
0
파일: OBO.py 프로젝트: wkpalan/aigo
def readGOoboXML(fileName, force=False, prefix="GO"):
    """Load a GO graph from an OBO file, using a pickle cache next to it.

    The cache lives at ``<fileName>.pic``. It is used when it exists and
    ``force`` is false; otherwise (or when the cache cannot be read) the OBO
    file is parsed with get_GOGraph() and the cache is rewritten.

    Parameters:
        fileName: path of the OBO file; passed through checkForZip() before
            parsing.
        force: when true, ignore any existing cache and re-parse the file.
        prefix: forwarded to get_GOGraph().

    Returns:
        The GO graph, or None if loading failed and logger.handleFatal()
        returned instead of aborting.

    Any error while parsing or caching is reported through
    logger.handleFatal().
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    picName = "%s.pic" % fileName
    G = None
    if not os.path.exists(picName):
        force = True

    if not force:
        try:
            logger.info("Reading serialized OBO file : %s" % picName)
            # SECURITY NOTE: unpickling can execute arbitrary code. The cache
            # is expected to be a file written by this function; do not
            # point fileName at a directory holding untrusted .pic files.
            with open(picName, "rb") as f:
                G = pickle.load(f)
        except (IOError, EOFError, pickle.UnpicklingError) as inst:
            # Unreadable, empty or corrupt cache: fall back to the OBO file.
            print(str(type(inst)) + " for " + picName)
            force = True

    try:
        if force:
            fileName = checkForZip(fileName)
            if not os.path.exists(fileName):
                raise IOError(fileName + " does not exist and is required ")

            logger.info("Reading OBO file : %s" % fileName)

            G = get_GOGraph(readFile(fileName, mode="r"), prefix=prefix)
            G.fileName = fileName

            # The with-statement closes the file; no explicit close needed.
            with open(picName, "wb") as f:
                logger.info("Saving serialized OBO file")
                pickle.dump(G, f, -1)
    except Exception as e:
        logger.handleFatal("Unable to read file %s: %s" % (fileName, str(e)))

    # The original fell off the end here and discarded the graph.
    return G
예제 #4
0
파일: IO.py 프로젝트: wkpalan/aigo
def readGAF_2(fileName):
    """Read a tab-delimited GAF 2.x annotation file.

    The whole file is loaded in memory. Leading rows whose first field
    starts with '!' form the header; one of them must match the
    gaf-version pattern for version 2.

    Parameters:
        fileName: path handed to readFile().

    Returns:
        ``(rows, GAF_col)``: an iterator over the rows following the header
        (each a list of fields) and the list of GAF column names, in column
        order.

    Raises:
        Exception: if no header line declares GAF version 2. This includes
            an empty file.
    """
    GAF_col = [
        "DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID",
        "DB:Reference", "Evidence Code", "With (or) From", "Aspect",
        "DB Object Name", "DB Object Synonym", "DB Object Type",
        "Taxon(|taxon)", "Date", "Assigned By", "Annotation Extension",
        "Gene Product Form ID"
    ]

    # Read the entire file
    data = list(csv.reader(readFile(fileName), delimiter="\t"))

    # Read the header. The bounds and emptiness checks keep an empty file, a
    # file made only of '!' lines, or a blank line from raising IndexError.
    seek = 0
    GAF_OK = False
    while seek < len(data) and data[seek] and data[seek][0].startswith("!"):
        if re.search("!.*gaf-version.*:.*2", data[seek][0]):
            GAF_OK = True
        seek += 1

    if not GAF_OK:
        raise Exception("Sorry, GAF format version 2.0 expected.")

    return iter(data[seek:]), GAF_col
예제 #5
0
    def add(self, fileName, refType="Fasta"):
        """Add the gene product ids found in *fileName* to this reference set.

        The file name and type are recorded on ``self.fileName`` and
        ``self.refType``; both become lists once more than one file has been
        added. Ids are merged in with ``self.update()``.

        Parameters:
            fileName: path of the file; passed through checkForZip().
            refType: how to read the file, one of
                "Fasta" - ids taken from the record names (parsed with
                    Bio.SeqIO),
                "Text"  - first column of a ';'-delimited file,
                "GAF"   - taxon and object symbol joined with '.',
                "AFFY"  - first column of an Affymetrix csv file, skipping
                    rows whose fifth column is "Control sequence".

        A missing file, an unknown refType or any read error is reported
        through logger.handleFatal().
        """
        # Keep track of every source file: scalar for the first one, list
        # from the second one on.
        if self.fileName == '':
            self.fileName = fileName
            self.refType = refType
        elif isinstance(self.fileName, list):
            self.fileName.append(fileName)
            self.refType.append(refType)
        else:
            self.fileName = [self.fileName, fileName]
            self.refType = [self.refType, refType]

        fileName = checkForZip(fileName)
        if not os.path.exists(fileName):
            logger.handleFatal(fileName + " does not exist and is required ")

        logger.info("Organism :\t%s" % self.organism)

        logger.info("%s file :\t%s " % (refType, fileName))

        try:

            # Use fasta file to define the reference set
            if refType == "Fasta":
                from Bio import SeqIO
                # Keep what precedes the first ';', then what follows the
                # last ':' of the record name.
                allID = set(
                    rec.name.split(";")[0].split(":")[-1]
                    for rec in SeqIO.parse(readFile(fileName), "fasta"))
                self.update(allID)

            # Use a simple text file to define the reference set, first
            # column is chosen by default
            elif refType == "Text":
                allID = set(
                    r[0] for r in csv.reader(readFile(fileName), delimiter=";"))
                self.update(allID)

            # Use a GO annotation file to define the reference set
            elif refType == "GAF":
                from AIGO.IO import readGAF_2
                data, GAF_col = readGAF_2(fileName)

                # Resolve the column positions once instead of once per row.
                taxonCol = GAF_col.index("Taxon(|taxon)")
                symbolCol = GAF_col.index("DB Object Symbol")

                # [6:] drops the first six characters of the taxon field
                # (presumably the "taxon:" prefix - confirm against the data).
                allID = set(
                    ".".join([row[taxonCol][6:], row[symbolCol]])
                    for row in data)
                self.update(allID)

            # Use a Affymetrix annotation file to define the reference set
            elif refType == "AFFY":
                f = readFile(fileName)
                # Skip comments. startswith() is safe on '' (EOF), where
                # indexing row[0] would raise IndexError.
                row = f.readline()
                while row.startswith('#'):
                    row = f.readline()
                # `row` now holds the header line, deliberately discarded.

                control = "Control sequence".upper()
                allID = set()
                for row in csv.reader(f):
                    # The csv reader yields [] for blank lines.
                    if not row:
                        continue
                    # Read gene product id if not control sequence
                    if row[4].upper() != control:
                        allID.add(row[0])

                self.update(allID)
            else:
                print("Sorry, unknown file type !!")
                # NOTE(review): extend() is not defined in the visible code
                # and adding an empty list looks like a no-op; confirm it is
                # needed.
                self.extend([])
                # Carry a message so the fatal log below says what went wrong.
                raise Exception("unknown reference type %r" % (refType,))

            if len(self) == 0:
                logger.handleWarning("No gene products loaded")

        except Exception as e:
            logger.handleFatal("Unable to read file %s: %s" %
                               (fileName, str(e)))