def readIds(fname):
    if os.path.isfile(fname):
        logger.debug("Reading assigned authorIds from %s" % fname)
        # wrap in set() so both branches return the same type
        return set(maxTables.TableParser(fname).column("authorUniqueName"))
    else:
        logger.warning("Cannot find %s" % fname)
        return set()
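def _readIdsExample():
    # Minimal usage sketch for readIds; "assignedAuthorIds.tsv" is a
    # hypothetical filename. Because readIds falls back to an empty set when
    # the file is missing, membership can be tested without a None check.
    doneIds = readIds("assignedAuthorIds.tsv")
    return "doe_j" in doneIds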
def filter(compBlastFile, genomeWeights, helperTaxons, bestTaxons, geneWeights,
           genomeFeatureFile, geneFeatureFile, bestGenomeFile, bestGeneFile,
           maxDist):
    """ combined genome/gene processing """
    # create field name converter
    fieldParser = maxTables.TableParser(fileObj=compBlastFile,
                                        fileType="blastConvert")
    # split file on first field (=pmcId)
    blockReader = maxTables.BlockReader(compBlastFile, 0)
    bestGenomePredictor = mostMatchesWeightedPredictor(genomeWeights,
                                                       helperTaxons, bestTaxons)

    docCount = 0
    for (pmcId, block) in blockReader.readNext():
        docCount += 1
        if docCount % 1000 == 0:
            logger.info("Processed %d documents" % docCount)
        pmcId = int(pmcId)
        recs = fieldParser.parseBlock(block)

        # keep only best hits
        bestHits = removeNonOptHits(pmcId, recs)
        if len(bestHits) == 0:  # everything matched univec
            logger.debug("docId %s: everything seems to match univec" % str(pmcId))
            continue

        bestGenomeInfo = bestGenomePredictor.getGenomeId(pmcId, bestHits)
        if bestGenomeInfo is None:
            logger.debug("no best genome found in best hits, trying all hits")
            # second attempt deliberately uses all hits, not just the best ones
            bestGenomeInfo = bestGenomePredictor.getGenomeId(pmcId, recs)
            bestHits = recs
        if bestGenomeInfo is None:
            logger.debug("no best genome found, giving up")
            continue

        # keep only hits from best genome and separate them into genes/genomes
        genomeHits, geneHits = filterSplitHits(bestHits, bestGenomeInfo.bestGenome)

        # chain hits and disambiguate the chains
        genomeFeatures = disambiguateChains(chainHitsGenomes(genomeHits, maxDist=maxDist))
        rawGeneFeatures = disambiguateChains(chainHitsGenes(geneHits))

        # disambiguate genes
        bestGenes, geneFeatures = filterGeneFeatures(rawGeneFeatures, geneWeights)

        # write to output files, only if we have some supporting evidence
        if len(genomeFeatures) > 0 or len(bestGenes) > 0:
            writeTsvFeatures(genomeFeatureFile, pmcId, genomeFeatures)
            writeTsvFeatures(geneFeatureFile, pmcId, geneFeatures)
            writeTuples(bestGeneFile, pmcId, bestGenes)
            writeTuples(bestGenomeFile, pmcId, [bestGenomeInfo.bestGenome])
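def _filterExample():
    # Hedged wiring sketch for filter(); all filenames are hypothetical and
    # the weight/taxon tables are shown as empty placeholders, while in real
    # use they would be loaded from the corresponding data files.
    # maxDist=50000 is an arbitrary example value, not a recommended setting.
    genomeWeights, geneWeights = {}, {}
    helperTaxons, bestTaxons = set(), set()
    compBlastFile = open("blastHits.tsv")
    filter(compBlastFile, genomeWeights, helperTaxons, bestTaxons, geneWeights,
           genomeFeatureFile=open("genomeFeatures.tsv", "w"),
           geneFeatureFile=open("geneFeatures.tsv", "w"),
           bestGenomeFile=open("bestGenomes.tsv", "w"),
           bestGeneFile=open("bestGenes.tsv", "w"),
           maxDist=50000)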
def filterProtein(compBlastFile, proteinTableFilename):
    # create field name converter
    fieldParser = maxTables.TableParser(fileObj=compBlastFile,
                                        fileType="blastConvert")
    # split file on first field (=pmcId)
    blockReader = maxTables.BlockReader(compBlastFile, 0)

    protTable = open(proteinTableFilename, "w")
    headers = ["#documentId", "sequenceId", "best match in PDB", "BLAST score"]
    protTable.write("\t".join(headers) + "\n")

    for (pmcId, block) in blockReader.readNext():
        pmcId = int(pmcId)
        recs = fieldParser.parseBlock(block)
        # keep only best hits
        bestHits = removeNonOptHits(pmcId, recs)
        for bh in bestHits:
            data = [str(bh.pmcId), str(bh.seqId), bh.chrom, str(bh.score)]
            protTable.write("\t".join(data) + "\n")
    protTable.close()
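def _filterProteinExample():
    # Usage sketch for filterProtein; both filenames are hypothetical. The
    # resulting table has four tab-separated columns:
    # documentId, sequenceId, best match in PDB, BLAST score.
    compBlastFile = open("proteinBlastHits.tsv")
    filterProtein(compBlastFile, "proteinTable.tsv")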
def convertBlastFiles(blastDirs, genomeToTaxFile, tempFilename, outFilename,
                      fileFormat):
    """ collect all alignment files from blastDirs and subdirs, map organism
    names to taxon ids using genomeToTaxFile and write the results to
    outFilename. fileFormat can be "blast", "blat" or "bwa". """
    outfh = maxbio.openFile(tempFilename, "w")

    # read genome -> taxid map
    if genomeToTaxFile:
        orgToNum = {}
        logger.info("Reading genome -> number table from %s" % genomeToTaxFile)
        logger.info("Expected input fields are: (GenomeName, other field, ..., other field, genomeId)")
        for l in open(genomeToTaxFile):
            if l.startswith("#"):
                continue
            fs = l.strip().split("\t")
            genome = fs[0].lower().replace(" ", "_")
            orgToNum[genome] = int(fs[-1])
    else:
        logger.info("No genome map specified, results are not genome-based")
        orgToNum = None

    dropFileCount = 0
    lineCount = 0
    finishedTaxons = set()
    lastDir = None

    # map file format to filename extension
    if fileFormat == "blast":
        ext = ".blast"
    elif fileFormat == "blat":
        ext = ".psl"
    elif fileFormat == "bwa":
        ext = ".sam"
    else:
        raise ValueError("unknown file format parameter %s" % fileFormat)

    for blastDir in blastDirs:
        logger.info("Searching for files with extension %s in directory %s" % (ext, blastDir))
        files = list(util.findSubdirFiles(blastDir, ext))
        logger.info("Found %d files" % len(files))

        # convert files
        for fname in files:
            # convert organism to taxid, skip the file if not possible
            dirname = os.path.dirname(fname)
            org = os.path.basename(dirname).lower()
            if orgToNum is not None:
                if org in orgToNum:
                    orgNum = orgToNum[org]
                else:
                    # try to find any organism from the genome list in the filename
                    found = False
                    for dbOrg in orgToNum:
                        if dbOrg.replace(" ", "_").lower() in fname.lower():
                            orgNum = orgToNum[dbOrg]
                            logger.info("Found orgName %s in filename %s, using organism id %s" % (dbOrg, fname, str(orgNum)))
                            found = True
                            break
                    if not found:
                        logger.warning("could not resolve filename %s to taxid, dropping this file (recognized organism %s)" % (fname, org))
                        dropFileCount += 1
                        continue
            else:
                orgNum = -1

            # check if not already processed AND in a different directory
            # (blast creates several indices per directory), skip if yes
            #if orgNum in finishedTaxons and dirname != lastDir:
            #    print("warning: already processed this taxon id %d, skipping input file %s" % (orgNum, fname))
            #    continue
            finishedTaxons.add(orgNum)
            lastDir = dirname

            # convert lines
            f = open(fname, "r")
            if orgNum != -1:
                logger.info("Reading %s, genomeID %d" % (fname, orgNum))
            else:
                logger.info("Reading %s, not linked to any genome id" % fname)

            if fileFormat == "bwa":
                tp = maxTables.TableParser(fileType="sam")

            for l in f:
                lineCount += 1
                fs = l.strip().split("\t")
                if fileFormat == "blast":
                    # standard 12-column blast tabular output, example:
                    # 11495631 chr1 100.00 23 0 0 1 23 25500772 25500750 2e-05 46.1
                    srcId, trgId, perc, length, mismatches, gapOpens, qStart, \
                        qEnd, trgStart, trgEnd, eVal, score = fs
                elif fileFormat == "blat":
                    # psl format, see http://genome.ucsc.edu/FAQ/FAQformat.html#format2
                    # matches     - number of matching bases that aren't repeats
                    # misMatches  - number of bases that don't match
                    # repMatches  - number of matching bases that are part of repeats
                    # nCount      - number of 'N' bases
                    # qNumInsert  - number of inserts in query
                    # qBaseInsert - number of bases inserted in query
                    # tNumInsert  - number of inserts in target
                    # tBaseInsert - number of bases inserted in target
                    # strand      - '+' or '-' for query strand; for translated
                    #               alignments, a second '+' or '-' is for genomic strand
                    # qName/qSize/qStart/qEnd - query name, size, alignment start/end
                    # tName/tSize/tStart/tEnd - target name, size, alignment start/end
                    # blockCount  - number of blocks in the alignment (a block contains no gaps)
                    # blockSizes  - comma-separated list of sizes of each block
                    # qStarts     - comma-separated list of block starts in query
                    # tStarts     - comma-separated list of block starts in target
                    # example:
                    # 23 0 0 0 0 0 0 0 + 11075971|1 2299 2248 2271 scaffold_281 111378 17336 17359 1 23, 2248, 17336,
                    matches, misMatches, repMatches, nCount, qNumInsert, \
                        qBaseInsert, tNumInsert, tBaseInsert, strand, qName, \
                        qSize, qStart, qEnd, tName, tSize, tStart, tEnd, \
                        blockCount, blockSizes, qStarts, tStarts = fs
                    score = matches
                    perc = "%2.2f" % ((float(matches) + float(repMatches)) /
                                      (float(matches) + float(misMatches) + float(repMatches)) * 100.0)
                    trgId = tName
                    srcId = qName
                    trgStart = tStart
                    trgEnd = tEnd
                elif fileFormat == "bwa":
                    if fs[0].startswith("@"):  # skip sam header lines
                        continue
                    if len(fs) == 11:  # pad missing optional sam field
                        fs.append("")
                    row = tp.parseTuple(fs)
                    bed = maxTables.samToBed(row)
                    if bed is None:
                        continue
                    chrom, start, end, name, score, strand = bed
                    srcId = name
                    trgId = chrom
                    perc = "0"
                    trgStart, trgEnd = start, end
                else:
                    raise ValueError("unknown file format parameter %s" % fileFormat)

                trgStart = int(trgStart)
                trgEnd = int(trgEnd)
                if trgEnd < trgStart:
                    trgStart, trgEnd = trgEnd, trgStart

                # srcId has the format <pmcId>|<seqId>
                fs = srcId.split("|")
                srcId = fs[0]
                srcSeq = fs[1]
                pmcId = srcId.replace("PMC", "").replace(".txt", "").replace(".pdf", "")
                data = [pmcId, str(orgNum), srcSeq, trgId, str(trgStart),
                        str(trgEnd), str(score), str(perc)]
                outfh.write("\t".join(data) + "\n")

    outfh.close()
    logger.info("BlastHit output table format is (pmcId, genomeId, seqId, chrom, start, end, score, percentId)")
    logger.info("blastConvert: blast files dropped because of unresolvable species name %d, filesDropped=%d" % (dropFileCount, dropFileCount))
    logger.info("blastConvert: processed %d blast matches, blastMatches=%d" % (lineCount, lineCount))

    logger.info("Now sorting the file with the UNIX sort command")
    cmdLine = "sort -n %s | uniq > %s" % (tempFilename, outFilename)
    logger.info(cmdLine)
    ret = os.system(cmdLine)
    if ret == 0:
        logger.info("Sorting finished, no error")
    else:
        logger.error("Error occurred while sorting")
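def _convertBlastFilesExample():
    # Hedged usage sketch for convertBlastFiles; every path here is
    # hypothetical. genomeToTax.tsv is assumed to be a tab-separated file with
    # the genome name in the first column and the numeric genome id in the
    # last column; the temp file is sorted and deduplicated into outFilename.
    convertBlastFiles(blastDirs=["blastOut/hg19", "blastOut/mm9"],
                      genomeToTaxFile="genomeToTax.tsv",
                      tempFilename="hits.unsorted.tsv",
                      outFilename="hits.tsv",
                      fileFormat="blast")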