Ejemplo n.º 1
0
def getFile(path, logger=None, tmp=None):
    """Resolve *path* to a readable local file.

    If *path* is an existing local file it is returned unchanged; otherwise
    it is treated as a URL and downloaded into *tmp* as ``download.fasta``.

    Parameters:
        path: local file path or remote URL.
        logger: optional logger forwarded to the log helpers.
        tmp: temporary directory, prepared via ensureTmpIsPresent.

    Returns:
        Path to a local file (either *path* itself or the download target).

    Exits the process with status 1 when *path* is neither an existing
    file nor a valid URL.
    """
    ensureTmpIsPresent(tmp, logger=logger)

    logDebug(logger, "Getting file {}".format(path))

    if os.path.isfile(path):
        logDebug(logger, "Path is a valid file.")
        return path
    logDebug(logger, "Path is not a valid file path")

    isValidUrl = True
    try:
        validateUrl(path)
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are no longer swallowed; any validation failure marks an invalid URL.
    except Exception:
        isValidUrl = False

    if isValidUrl:
        logDebug(logger,
                 "Path seems like a fine url. Try to download the file")
        downloadPath = os.path.join(tmp, "download.fasta")
        logInfo(logger, "Downloading file {} from remote source".format(path))
        urllib.request.urlretrieve(path, downloadPath)
        return downloadPath

    logError(
        logger,
        "The given path {} is not a valid url nor points to any local file."
        .format(path))
    # `exit()` is injected by the `site` module and is absent under -S;
    # raising SystemExit directly is the portable equivalent.
    raise SystemExit(1)
Ejemplo n.º 2
0
def makeHMMScanRequest(
        seq,
        seqID=None,
        logger=None,
        hmmUrl="https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan",
        queryParams=None,
        queryHeaders=None):
    """Run a remote HMMScan (Pfam) search for one sequence via the EBI API.

    Parameters:
        seq: the sequence string to scan.
        seqID: optional identifier used for logging and tagging the result.
        logger: optional logger forwarded to the log helpers.
        hmmUrl: HMMScan endpoint URL.
        queryParams: extra multipart form fields; override the defaults.
        queryHeaders: extra HTTP headers; override the defaults.

    Returns:
        The parsed XML root element, or — when *seqID* is given — a dict
        ``{"seqID": seqID, "result": <Element>}``.

    Raises:
        Exception: when the response cannot be parsed as XML.
    """
    # Fix: the original used mutable `{}` defaults, which are created once
    # at definition time and shared (and mutable) across all calls.
    if queryParams is None:
        queryParams = {}
    if queryHeaders is None:
        queryHeaders = {}

    if seqID:
        logDebug(logger, "Doing HMMScan on seqence \"{}\"...".format(seqID))

    defaultParams = {
        "hmmdb": (None, "pfam"),
        "seq": (None, seq),
        "threshold": (None, "cut_ga")
    }
    defaultHeaders = {
        "Accept": "text/xml",
        "User-Agent": "Mozilla/5.0",
    }

    s = requests.Session()
    # Caller-supplied params/headers take precedence over the defaults.
    r = requests.Request('POST',
                         hmmUrl,
                         files={
                             **defaultParams,
                             **queryParams
                         },
                         headers={
                             **defaultHeaders,
                             **queryHeaders
                         }).prepare()
    answer = s.send(r)
    answerContent = answer.content.decode("utf-8")

    if seqID:
        logDebug(logger, "Done HMMScan on seqence \"{}\"".format(seqID))

    # Replace invalid tags the service sometimes emits (e.g. <123 H=.../>)
    # which would otherwise break the XML parser.
    if answerContent:
        answerContent = re.sub(r'<\d+ H=.*\/>', '', answerContent)

    xmlTree = None
    try:
        xmlTree = ET.fromstring(answerContent)
    # Narrowed from a bare `except:` to the specific parse failure.
    except ET.ParseError:
        logError(
            logger,
            "Failed HMMScan on sequence \"{}\" - Invalid response ".format(
                seqID))
        logError(logger, "xml [{}]".format(answerContent))
        raise Exception("XML Parse failed")

    if seqID:
        return {"seqID": seqID, "result": xmlTree}
    return xmlTree
Ejemplo n.º 3
0
def ensureTmpIsPresent(tmp, logger=None):
    """Make sure the temporary directory *tmp* exists and is empty.

    Creates the directory when it is missing; otherwise removes every
    regular file currently inside it (subdirectories are left untouched,
    and individual deletion failures are logged rather than raised).
    """
    if os.path.exists(tmp):
        logInfo(logger, "Clearing temporary download directory")
        for entry in os.listdir(tmp):
            entryPath = os.path.join(tmp, entry)
            try:
                if os.path.isfile(entryPath):
                    os.unlink(entryPath)
            except Exception as err:
                # Best-effort cleanup: report and keep going.
                logError(logger, err)
    else:
        os.makedirs(tmp)
Ejemplo n.º 4
0
def main(logginglevel, input, tmp):
    """Compare domain counts in two CSV matrices with Fisher's exact test.

    Loads both input matrices, normalizes them to a shared domain list,
    computes per-domain p-values with both the custom and the SciPy Fisher
    implementations, and renders the sorted results as a plotly table.

    Parameters:
        logginglevel: level forwarded to setLoggerLevel.
        input: sequence of exactly two CSV file paths.
        tmp: unused here; kept for a uniform entry-point signature.

    Exits with status 1 when the number of inputs is not exactly two.
    """
    logger = createLogger(__file__)
    logger = setLoggerLevel(logger, logginglevel)

    if len(input) != 2:
        logError(
            logger,
            "Invalid files number was specified. Please provide exactly two input files."
        )
        exit(1)

    inputA = input[0]
    inputB = input[1]
    logInfo(logger, "Fisher test of {} compared to {}".format(inputA, inputB))

    # NOTE(review): inputA is loaded into tableB and inputB into tableA.
    # This cross-assignment may be intentional (defining which table is the
    # baseline) — confirm before "fixing" the apparent swap.
    tableB = loadMatrixFromCSV(inputA, logger=logger)
    tableA = loadMatrixFromCSV(inputB, logger=logger)
    (domainNames, tables) = normalizeTables([tableA, tableB], logger=logger)

    domainPsCustom = [
        round(v, 2)
        for v in customFisher(domainNames, tables[0], tables[1], logger=logger)
    ]
    domainPsFisher = [
        round(v, 2)
        for v in scipyFisher(domainNames, tables[0], tables[1], logger=logger)
    ]

    # One row per domain: [name, custom p, scipy p], sorted by custom p.
    results = [list(row)
               for row in zip(domainNames, domainPsCustom, domainPsFisher)]
    results.sort(key=lambda x: x[1])

    # Transpose rows into the per-column value lists plotly's Table expects.
    resultsT = [[row[i] for row in results] for i in range(3)]
    table = go.Table(
        header=dict(values=['Domain', 'Custom Fisher', 'SciPy Fisher']),
        cells=dict(values=resultsT))

    # Fix: the original built `data = [table, bars]` where `bars` was never
    # defined anywhere, raising NameError before the plot was written.
    py.offline.plot({"data": [table]}, auto_open=False)
Ejemplo n.º 5
0
def entrezRetrieveSequence(
        accessionIDs,
        tmp='.',
        entrezUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        dbName='protein',
        retries=5,
        logger=None):
    """Fetch FASTA records for the given accession IDs from NCBI Entrez.

    Downloads all IDs in a single efetch request (retried on HTTP errors
    with a 5 s back-off) into a file under *tmp*, then parses that file
    with Bio.SeqIO.

    Parameters:
        accessionIDs: list of accession ID strings; an empty list returns [].
        tmp: directory for the temporary download file.
        entrezUrl: efetch endpoint URL.
        dbName: Entrez database name.
        retries: total number of download attempts.
        logger: optional logger for retry messages (new parameter with a
            default, so existing callers are unaffected).

    Returns:
        List of parsed sequence records (empty for empty input).

    Raises:
        urllib.error.HTTPError: when the final attempt still fails.
    """
    if len(accessionIDs) <= 0:
        return []

    downloadedFilePath = os.path.join(
        tmp, "etrez.{}.{}.fasta".format(
            accessionIDs[0] + "_batch__" + str(len(accessionIDs)), dbName))

    retryNo = 0
    while retryNo < retries:
        retryNo = retryNo + 1
        try:
            urllib.request.urlretrieve(
                '{}?db={}&id={}&rettype=fasta&retmode=text'.format(
                    entrezUrl, dbName, ",".join(accessionIDs)),
                downloadedFilePath)
            break
        except urllib.error.HTTPError as err:
            # Fix 1: the original tested `retryNo < retries - 1`, which
            # re-raised one attempt early — the last permitted retry never
            # ran. Fix 2: the original referenced an undefined global
            # `logger` here, so a failed request raised NameError instead
            # of logging; `logger` is now an optional parameter.
            if retryNo < retries:
                logError(logger, "Request failed retrying in 5 seconds...")
                time.sleep(5)
            else:
                raise err

    ret = []
    with open(downloadedFilePath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            ret.append(record)

    return ret
Ejemplo n.º 6
0
def main(logginglevel, input, tmp, idthreshold, evalue):
    """Extend each input FASTA file with homologous sequences from NCBI.

    For every input file: run a remote BLASTP search (web qblast) against
    nr_v5, collect the hit accession IDs, fetch the matching sequences from
    Entrez in parallel batches on a thread pool, and write them to
    ``<name>_ext.fasta``. Finally prints the written output paths to stdout,
    one per line.

    Parameters:
        logginglevel: level forwarded to setLoggerLevel.
        input: iterable of FASTA file paths or URLs (resolved via getFile).
        tmp: temporary download directory.
        evalue: e-value cutoff passed to qblast (converted to float).
        idthreshold: percent-identity threshold passed to qblast
            (converted to float).

    Raises:
        Exception: when a BLAST query returns no hits.
    """
    logger = createLogger(__file__)
    logger = setLoggerLevel(logger, logginglevel)

    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()

        # Resolve every input (local path or URL) to a local file first.
        inputFiles = [
            getFile(inputFile, logger=logger, tmp=tmp) for inputFile in input
        ]
        outputFiles = []
        for file in inputFiles:

            fileOut = file.replace(".fasta", "_ext.fasta")
            logInfo(logger, "Extending file {} to {}".format(file, fileOut))
            seqs = []

            # Concatenate all records into one multi-sequence query string.
            with open(file, "r") as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    seqs.append("> {}\n{}".format(record.id, str(record.seq)))
            queryStr = "\n".join(seqs)

            logInfo(logger, "Requesting web BLASTP search")
            requestOut = qblast("blastp",
                                "nr_v5",
                                queryStr,
                                expect=float(evalue),
                                perc_ident=float(idthreshold)).getvalue()

            # qblast returns XML; parse it to extract the hit list.
            root = ET.fromstring(requestOut)

            accessionIDs = []

            hits = root.findall(
                "./BlastOutput_iterations/Iteration/Iteration_hits/Hit")
            logInfo(logger, "Got {} hits for given query".format(len(hits)))
            if len(hits) <= 0:
                # Dump the raw response to help diagnose empty result sets.
                logInfo(logger, requestOut)
                raise Exception('No hits found')

            recordsOut = []
            recordCountOut = 0
            # NOTE(review): the hit accession is appended once per HSP
            # (Hsp_qseq) of each hit, so accessionIDs can contain
            # duplicates — confirm whether that is intended before
            # deduplicating. `seqs` also shadows the query list above.
            for hit in hits:
                seqs = hit.findall("./Hit_hsps/Hsp/Hsp_qseq")
                for seq in seqs:
                    accessionID = hit.find("./Hit_accession").text
                    accessionIDs.append(accessionID)

            gatherTasks = []

            # Split the IDs into at most `requestsLimit` batches so Entrez
            # sees a few large requests rather than many small ones.
            requestsLimit = 3
            chunkSize = int(len(accessionIDs) / requestsLimit) + 1
            accessionChunks = [
                chunk for chunk in chunks(accessionIDs, chunkSize)
            ]
            logDebug(
                logger,
                "Chunked request, will request Entrez for {} batches of size {}."
                .format(len(accessionChunks), chunkSize))

            # Fetch each batch concurrently on the thread pool. Note the
            # loop variable shadows the outer accessionIDs list (harmless,
            # since the full list is no longer needed here).
            for accessionIDs in accessionChunks:
                gatherTasks.append(
                    loop.run_in_executor(executor, entrezRetrieveSequence,
                                         *((accessionIDs, tmp))))

            gatherFuture = asyncio.ensure_future(asyncio.gather(*gatherTasks))
            loop.run_until_complete(gatherFuture)

            # Flatten the per-batch record lists into FASTA-formatted strings.
            recordsOut = []
            for records in gatherFuture.result():
                for record in records:
                    recordsOut.append(record.format("fasta"))

            recordCountOut = len(recordsOut)
            logDebug(logger, "Writing to file {}".format(fileOut))
            with open(fileOut, "w") as fileOutHandle:
                fileOutHandle.write("\n".join(recordsOut))

            # Sanity check: re-parse the written file and verify the record
            # count matches what was intended; mismatch is logged, not fatal.
            with open(fileOut, "r") as handle:
                recordCount = 0
                for record in SeqIO.parse(handle, "fasta"):
                    recordCount = recordCount + 1
                if recordCount != recordCountOut:
                    logError(
                        logger,
                        "Got mismatch records count after writing output file. {} records are present in {} and there should be {} reconds."
                        .format(recordCount, fileOut, recordCountOut))
            outputFiles.append(fileOut)

        logInfo(
            logger,
            "Written {} files in total. Returning them to stdout as plaintext paths list"
            .format(len(outputFiles)))
        print("\n".join(outputFiles))