Example #1
def fetchUniprotEntries(fasta, newaccs=None):
    print "Fetching uniprot entries.."
    uniprot_file = fasta + ".uniprot"
    already_exists = False
    if os.path.isfile(uniprot_file):
        already_exists = True
        lastline = tail(uniprot_file, 1)
        lastline = lastline[0]
        lastline = lastline.rstrip()

        #NOTE: strip the closing </uniprot> tag in pure Python (the old "head -n -1" shell command is kept commented out below)
        if lastline == "</uniprot>":
            with open(uniprot_file, 'r') as fh:
                lines = fh.readlines()
            del lines[-1]
            with open(uniprot_file, 'w') as fh:
                fh.writelines(lines)
            #cmd = "head -n -1 " + uniprot_file + " > " + uniprot_file + ".2; " + "mv " + uniprot_file + ".2 " + uniprot_file
            #print cmd
            #os.system(cmd)

    fasta_ids = newaccs
    if newaccs is None:
        fasta_ids = getACCsFromFastaFile(fasta)
    print fasta_ids
    if fasta_ids:
        l = range(0, len(fasta_ids))
        chunks = FetchUtils.getChunks(l, 50)
        for i in range(0, len(chunks)):
            print "Fetching " + str(i) + " chunk out of " + str(len(chunks))
            sys.stdout.flush()
            accs = [fasta_ids[j] for j in chunks[i]]
            # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
            try:
                uniprots = FetchUtils.returnFetchBatch('uniprot', ','.join(accs), 'xml', 'default')
                with open(fasta + ".uniprot", "a") as f:
                    if i != 0 or already_exists:
                        uniprots = re.sub(r"\<uniprot.+", '', uniprots)
                        uniprots = re.sub(r"\<\?xml version=.+\>", '', uniprots)
                    if i != len(chunks) - 1:
                        uniprots = re.sub(r"\</uniprot.*>", '', uniprots)
                    f.write(uniprots)
            except Exception:
                print "Could not fetch this chunk"
                continue
            else:
                print "succeeded in fetching chunk"
                continue
    return fasta_ids
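Both this example and the ones below batch their identifiers with FetchUtils.getChunks before calling the EBI fetchBatch service. That helper is not shown here; the sketch below is only a guess at its behavior, inferred from how it is called (index lists split into batches of 50 or 100), not from the real FetchUtils code.

def getChunks(indices, size):
    # Hypothetical sketch: split a list of indices into consecutive
    # batches of at most `size` elements, preserving order.
    # e.g. getChunks(range(0, 120), 50) -> [[0..49], [50..99], [100..119]]
    return [indices[i:i + size] for i in range(0, len(indices), size)]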
Example #2
def main(argv=None):
    global fetchme

    if argv is None:
        argv = sys.argv

    fasta = argv[1]

    if len(argv) > 2:
        if argv[2] == "-noFetch":
            fetchme = False
        else:
            fetchme = True

    if fetchme:
        print "Fetching data.."
    else:
        print "No fetching.."

    FetchUtils.soap_setup()

    print fasta

    proteins = None

    if fetchme:
        if os.path.isfile(fasta + ".uniprot"):
            print "File: " + fasta + ".uniprot data already exists"
            accs = getACCsFromFastaFile(fasta)
        else:
            print "File: " + fasta + ".uniprot data does not exist.\nRetrieving data from EBI.."
            accs = fetchUniprotEntries(fasta)

    proteins = getDataFromUniprot(fasta, getACCsFromFastaFile(fasta))
    # for id, protein in proteins.iteritems():
    # print protein, "\n"


    proteins = poolProteinCDSs(proteins, fasta)
    assignAlignseqsToProteins(proteins, fasta)
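getACCsFromFastaFile is used by both examples above but defined elsewhere. A minimal sketch, assuming it pulls the accession field out of UniProt-style FASTA headers (">db|ACCESSION|NAME ..."); the real helper may expect a different header layout.

def getACCsFromFastaFile(fasta):
    # Hypothetical sketch: collect one accession per ">" header line.
    accs = []
    with open(fasta) as fh:
        for line in fh:
            if line.startswith(">"):
                fields = line[1:].rstrip().split("|")
                # pipe-delimited UniProt header, else first whitespace token
                accs.append(fields[1] if len(fields) > 1 else fields[0].split()[0])
    return accs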
Example #3
def fetchEntries(reftype, pool, fname, add=True):
    if os.path.isfile(fname) and not add:
        print "File: " + fname + " data already exists. Reading sequences from file.."
        return 0  # read_cdss(fname)
    else:
        print "File: " + fname + " data does not exist or partial.\nRetrieving relevant data from EBI.."
        l = range(0, len(pool))
        chunks = FetchUtils.getChunks(l, 100)
        print reftype
        if reftype == 'refseqn':
            for i in range(0, len(chunks)):
                print i
                ids = [pool[j] for j in chunks[i]]
                # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
                print ','.join(ids)
                try:
                    entries = None
                    entries = FetchUtils.returnFetchBatch(reftype, str(','.join(ids)), 'default', 'default')
                    with open(fname, "a") as f:
                        if len(entries) > 0:
                            f.write(''.join(str(v) for v in entries))
                        else:
                            print "Nothing to write"
                except suds.WebFault as detail:
                    print "MY DETAILS!!!!!!!!!!!!!!!!!!!!:"
                    print detail

                # entries = returnFetchBatch(reftype, ','.join(ids), 'default', 'default')
                # with open(fname, "a") as f:
                # f.write(entries)
            with open(fname, "r") as f, open(fname + "2", "w") as fo:
                lastline = None
                for line in f:
                    match = re.match("(.+)<a href=.+>(\w+)</a>(.+)", line)
                    if match:
                        fo.write(match.group(1) + match.group(2) + match.group(3) + "\n")
                    else:
                        fo.write(line)
                        lastline = line
                if lastline != "//":
                    fo.write("//")
                fo.flush()
                os.fsync(fo)
            if os.path.isfile(fname + "2"):
                os.remove(fname)
                os.rename(fname + "2", fname)
        elif reftype == 'emblcds':
            for i in range(0, len(chunks)):
                print i
                ids = [pool[j] for j in chunks[i]]
                # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
                print ','.join(ids)
                try:
                    entries = None
                    entries = FetchUtils.returnFetchBatch(reftype, str(','.join(ids)), 'default', 'default')
                    with open(fname, "a") as f:
                        if len(entries) > 0:
                            f.write(''.join(str(v) for v in entries))
                        else:
                            print "Nothing to write"
                except suds.WebFault as detail:
                    print "MY DETAILS!!!!!!!!!!!!!!!!!!!!:"
                    print detail

                # entries = returnFetchBatch(reftype, ','.join(ids), 'default', 'default')
                # with open(fname, "a") as f:
                # f.write(entries)

        elif reftype == 'ensembltranscript':
            l = range(0, len(pool))
            chunks = FetchUtils.getChunks(l, 50)
            print reftype
            DEVNULL = open(os.devnull, 'wb')
            for i in range(0, len(chunks)):
                print "retrieveing: " + str(i) + "\n"
                ids = [pool[j] for j in chunks[i]]
                if _platform == "linux" or _platform == "linux2":
                    sep = '"'
                elif _platform == "darwin":
                    sep = '"'
                elif _platform == "win32":
                    sep = '\\\"'
                myids = (', '.join(sep + item + sep for item in ids))
                if _platform == "linux" or _platform == "linux2":
                    curlCmd = "curl -H'Accept: application/json' -H'Content-type: application/json' -XPOST --data '{\"ids\" : [" + myids + "] }' 'http://rest.ensembl.org/sequence/id?object_type=transcript&type=cds'"
                elif _platform == "darwin":
                    curlCmd = "curl -H'Accept: application/json' -H'Content-type: application/json' -XPOST --data '{\"ids\" : [" + myids + "] }' 'http://rest.ensembl.org/sequence/id?object_type=transcript&type=cds'"
                elif _platform == "win32":
                    curlCmd = "curl -H\"Accept: application/json\" -H\"Content-type: application/json\" -XPOST --data \"{\\\"ids\\\" : [" + myids + "] }\" \"http://rest.ensembl.org/sequence/id?object_type=transcript&type=cds\""
                print curlCmd
                with open(fname + "tmp.dnld2", "w") as f:
                    # print "Writing " + gene + " to " + fname
                    p = subprocess.Popen(
                        curlCmd,
                        shell=True, stdout=f, stderr=DEVNULL)
                    # p = subprocess.Popen("wget -q --header='Content-type:text/x-fasta' 'http://rest.ensembl.org/sequence/id/" + str(gene) + "?type=cds' -O -", shell=True, stdout=f, stderr=subprocess.STDOUT)
                    ret_code = p.wait()
                    with open(fname + "tmp.dnld2", "r") as fo2:
                        myres = json.loads(fo2.read())
                        print myres
                        if (len(myres) > 0):
                            with open(fname, "a") as f:
                                for i_id in range(0, len(myres)):
                                    f.write(">" + myres[i_id]["id"] + "\n" + myres[i_id]["seq"] + "\n")
            DEVNULL.close()
        elif reftype == 'ensemblgenomestranscript':
            l = range(0, len(pool))
            chunks = FetchUtils.getChunks(l, 50)
            print reftype
            DEVNULL = open(os.devnull, 'wb')
            for i in range(0, len(chunks)):
                print "retrieveing: " + str(i) + "\n"
                ids = [pool[j] for j in chunks[i]]
                if _platform == "linux" or _platform == "linux2":
                    sep = '"'
                elif _platform == "darwin":
                    sep = '"'
                elif _platform == "win32":
                    sep = '\\\"'
                myids = (', '.join(sep + item + sep for item in ids))
                if _platform == "linux" or _platform == "linux2":
                    curlCmd = "curl -H'Accept: application/json' -H'Content-type: application/json' -XPOST --data '{\"ids\" : [" + myids + "] }' 'http://rest.ensemblgenomes.org/sequence/id?object_type=transcript&type=cds'"
                elif _platform == "darwin":
                    curlCmd = "curl -H'Accept: application/json' -H'Content-type: application/json' -XPOST --data '{\"ids\" : [" + myids + "] }' 'http://rest.ensemblgenomes.org/sequence/id?object_type=transcript&type=cds'"
                elif _platform == "win32":
                    curlCmd = "curl -H\"Accept: application/json\" -H\"Content-type: application/json\" -XPOST --data \"{\\\"ids\\\" : [" + myids + "] }\" \"http://rest.ensemblgenomes.org/sequence/id?object_type=transcript&type=cds\""
                print curlCmd
                with open(fname + "tmp.dnld2", "w") as f:
                    # print "Writing " + gene + " to " + fname
                    p = subprocess.Popen(
                        curlCmd,
                        shell=True, stdout=f, stderr=DEVNULL)  # subprocess.STDOUT)
                    # p = subprocess.Popen("wget -q --header='Content-type:text/x-fasta' 'http://rest.ensembl.org/sequence/id/" + str(gene) + "?type=cds' -O -", shell=True, stdout=f, stderr=subprocess.STDOUT)
                    ret_code = p.wait()
                    with open(fname + "tmp.dnld2", "r") as fo2:
                        myres = json.loads(fo2.read())
                        print myres
                        if (len(myres) > 0):
                            with open(fname, "a") as f:
                                for i_id in range(0, len(myres)):
                                    f.write(">" + myres[i_id]["id"] + "\n" + myres[i_id]["seq"] + "\n")
            DEVNULL.close()
        else:
            print reftype + " No such reftype."

    return read_cdss(fname)
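The 'ensembltranscript' and 'ensemblgenomestranscript' branches build a platform-specific curl command and parse the JSON reply by hand. The same POST can be issued directly from Python; below is a sketch using the requests library (a dependency the original code does not use) against the endpoint, headers, and {"ids": [...]} payload shown above. For the ensemblgenomes branch, pass server="http://rest.ensemblgenomes.org".

import requests

def fetch_ensembl_cds(ids, server="http://rest.ensembl.org"):
    # POST the id list as JSON, mirroring the curl command above.
    r = requests.post(
        server + "/sequence/id?object_type=transcript&type=cds",
        headers={"Accept": "application/json",
                 "Content-Type": "application/json"},
        json={"ids": list(ids)},
    )
    r.raise_for_status()
    # Each entry carries "id" and "seq", the same keys parsed above.
    return [(entry["id"], entry["seq"]) for entry in r.json()]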