# Fragment of a top-level script (Python 2 `print` syntax; the original
# indentation is not visible here).
# It prints the current `line`, then reads the next tab-separated line from
# `infile`, drops empty fields and takes: field 0 as the locus name, fields 1-2
# as reference start/end and fields 3-4 as the contig start/end.
# When the contig start is greater than the contig end, the locus sequence is
# passed through biomodule.reverseComplement before becoming `startingScaffold`;
# otherwise it is used as is.
# It then opens "halpfSteps.fasta" for writing and starts looping over the
# remaining lines of `infile`, parsing each one the same way.  The loop body
# continues beyond this fragment.
# NOTE(review): "halpfSteps.fasta" looks like a misspelling of "halfSteps" -
# confirm nothing downstream depends on the current name before changing it.
print line line = infile.readline().rstrip() coord = [] field = line.split("\t") for item in field: if not item == '': coord.append(item) locus1 = coord[0] refCoord_start1 = int(coord[1]) refCoord_end1 = int(coord[2]) coord1_start = int(coord[3]) coord1_end = int(coord[4]) if coord1_start > coord1_end: startingScaffold = biomodule.reverseComplement(sequences[locus1]) else: startingScaffold = sequences[locus1] step = 1 halfSteps = open("halpfSteps.fasta", "w") while True: line = infile.readline().rstrip() if not line: break coord = [] field = line.split("\t") for item in field: if not item == '': coord.append(item) locus2 = coord[0]
def _parseCdhitClusters(clstrLines):
    """Group the sequence ids listed in a cd-hit ``.clstr`` file by cluster.

    The first line, and every later line starting with ">", opens a new
    cluster; clusters are named "cluster0", "cluster1", ... in order of
    appearance.  Every other line is a member record whose id is the text
    between ">" and "...".  Reading stops at the first empty line.

    Returns a dict mapping cluster name -> list of member ids.
    """
    clusters = {}
    members = None
    for rawLine in clstrLines:
        line = rawLine.rstrip()
        if not line:
            break
        if members is None or line[0] == ">":
            # Header line: it only opens the next cluster and must not be
            # counted as a member of the previous one.
            members = []
            clusters["cluster" + str(len(clusters))] = members
            continue
        members.append(line.split(">")[1].split("...")[0])
    return clusters


def greedyElongation(seq):
    """Iteratively extend ``seq`` with reads taken from ``unmapped.fasta``.

    Steps performed:

    1. Run cd-hit-est on ``unmapped.fasta`` and, when clusters are found,
       replace ``unmapped.fasta`` with the reads of the largest cluster only.
    2. Repeat: BLAST the unmapped reads against ``toElong.fasta``, keep the
       hits that reach the last 100 positions of the current reference,
       assemble them with cap3 together with the last 200 nt of the
       reference, orient the longest contig like the reference and merge it
       with the reference through ``fuseSequences2``.
    3. Stop and return the merged sequence as soon as it is not longer than
       the current reference; otherwise it becomes the new reference and is
       written to ``toElong.fasta`` for the next round.

    If the longest contig gives no BLAST hit against the reference end, a
    ``joined_W_WARNING_*`` file is written and the program exits.

    Relies on names defined elsewhere in the file: ``installationDirectory``,
    ``SeqIO``, ``biomodule``, ``fuseSequences2``, ``sequenceToElong``,
    ``sequenceToReach`` and ``startingSeq``.  ``toElong.fasta`` is expected to
    exist already when the function is called (it is only rewritten at the
    end of each round).

    NOTE(review): the shell commands redirect to a file literally named
    ``null`` in the working directory; ``/dev/null`` was presumably intended.
    The command strings are left untouched here.
    """
    reference = str(seq)

    os.system(
        installationDirectory +
        "src/conda/bin/cd-hit-est -d 0 -i unmapped.fasta -o unmapped_cdhit.fasta >null 2>&1"
    )

    # Select only highly represented unmapped reads: keep the reads that
    # belong to the largest cd-hit cluster.
    with open("unmapped_cdhit.fasta.clstr") as cdhitfile:
        clusters = _parseCdhitClusters(cdhitfile)

    unmapSeq = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        unmapSeq.setdefault(str(seq_record.id), str(seq_record.seq))

    if not clusters:
        print("Do nothing")
    else:
        biggerCluster = ""
        seqInBiggerScaffold = 0
        for clusterName, members in clusters.items():
            # Strict ">" keeps the first cluster among equally sized ones.
            if len(members) > seqInBiggerScaffold:
                seqInBiggerScaffold = len(members)
                biggerCluster = clusterName
        print("Bigger cluster", biggerCluster, "Size", seqInBiggerScaffold)

        # Without any populated cluster there is nothing to filter on, so
        # unmapped.fasta is left as it is.
        if biggerCluster:
            with open("tempCdhitFile", "w") as outcdhitfile:
                for sequ in clusters[biggerCluster]:
                    if sequ in unmapSeq:
                        outcdhitfile.write(">" + sequ + "\n" + unmapSeq[sequ] + "\n")
            os.system("mv tempCdhitFile unmapped.fasta")

    # Reload the (possibly filtered) read set used during elongation.
    unmappedSequences = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        unmappedSequences.setdefault(str(seq_record.id), str(seq_record.seq))

    while True:
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in toElong.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query unmapped.fasta -db toElong.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out outputBlast.txt >null 2>&1 "
        )

        # Collect the reference end plus every read aligned close to it.
        with open("toAssemble.fasta", "w") as toAssemble:
            toAssemble.write(">toElong\n" + reference[-200:] + "\n")
            with open("outputBlast.txt") as blastFile:
                for rawLine in blastFile:
                    line = rawLine.rstrip()
                    if not line:
                        break
                    fields = line.split("\t")
                    subjStart = int(fields[8])
                    subjEnd = int(fields[9])
                    tailLimit = len(reference) - 100
                    reachesEnd = subjStart > tailLimit or subjEnd > tailLimit
                    if (reachesEnd and abs(subjEnd - subjStart) > 40
                            and float(fields[2]) > 95):
                        readSeq = unmappedSequences[fields[0]]
                        if subjStart > subjEnd:
                            # Hit on the opposite strand: flip the read.
                            readSeq = biomodule.reverseComplement(readSeq)
                        toAssemble.write(">" + fields[0] + "\n" + readSeq + "\n")

        print("Perform phrap assembly step....")
        os.system(installationDirectory +
                  "src/conda/bin/cap3 toAssemble.fasta > cap3Assembly 2>null")

        # Among contigs of equal maximal length the last one wins (">=").
        longestScaffold = ""
        for seq_record in SeqIO.parse("toAssemble.fasta.cap.contigs", "fasta"):
            if len(str(seq_record.seq)) >= len(longestScaffold):
                longestScaffold = str(seq_record.seq)

        # Check whether the produced elonged scaffold is on the right orientation
        with open("tempScaffold.fasta", "w") as tempScaffold:
            tempScaffold.write(">tempScaffold\n" + longestScaffold + "\n")
        with open("tempScaffold_query.fasta", "w") as tempScaffold:
            tempScaffold.write(">reference\n" + reference[-100:] + "\n")
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in tempScaffold.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query tempScaffold_query.fasta -db tempScaffold.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out tempScaffold_outputBlast.txt >null 2>&1"
        )
        with open("tempScaffold_outputBlast.txt") as tempScaffold:
            fieldBlast = tempScaffold.readline().rstrip().split("\t")

        if len(fieldBlast) > 2:
            if int(fieldBlast[8]) > int(fieldBlast[9]):
                longestScaffold = biomodule.reverseComplement(longestScaffold)
        else:
            # No hit between the reference end and the new contig: save what
            # has been built so far and stop the whole program.
            print("WARNING! EXTENSION STOPPED FOR MISSING ELONGMENT!!")
            with open("joined_W_WARNING_" + sequenceToElong + "_" +
                      sequenceToReach, "w") as js:
                js.write(">joined_W_WARNING_" + sequenceToElong + "_" +
                         sequenceToReach + "\n" + startingSeq[:-1800] + reference)
            exit()

        longestScaffold = fuseSequences2(reference, longestScaffold)
        print("Elonged sequence has now a size of", len(longestScaffold),
              "nucleotides")

        if len(longestScaffold) <= len(reference):
            return longestScaffold

        reference = longestScaffold
        with open("toElong.fasta", "w") as toElong:
            toElong.write(">toElong\n" + reference + "\n")
# Fragment of a loop body from a top-level script (Python 2 `print` syntax;
# the loop header and the original indentation are not visible here).
# For each non-empty tab-separated line: take the locus name (field 0) and the
# contig start/end (fields 3-4), write "tempAssembly.fasta" with the current
# `startingScaffold` as ">PartialGenome" followed by the locus sequence (passed
# through biomodule.reverseComplement when start > end), and run
# `phrap tempAssembly.fasta`.  Each record of "tempAssembly.fasta.contigs" then
# replaces `startingScaffold`, and the scaffold is written to `halfSteps` under
# the header ">" + str(step).  "More than one seq" is printed when more than
# one contig was read.
# NOTE(review): `sequeunces` looks like a misspelling of `sequences`; unless a
# variable with that exact name is defined elsewhere this raises NameError.
# NOTE(review): `step` is not modified anywhere in this fragment, so every
# record would get the same header - confirm against the rest of the loop.
if not line: break coord = [] field = line.split("\t") for item in field: if not item == '': coord.append(item) locus2 = coord[0] coord2_start = int(coord[3]) coord2_end = int(coord[4]) tempAssembly = open("tempAssembly.fasta", 'w') if coord2_start > coord2_end: tempAssembly.write(">PartialGenome" + "\n" + startingScaffold + "\n" + ">" + locus2 + "\n" + biomodule.reverseComplement(sequeunces[locus2]) + "\n") else: tempAssembly.write(">PartialGenome" + "\n" + startingScaffold + "\n" + ">" + locus2 + "\n" + sequeunces[locus2] + "\n") tempAssembly.close() os.system("phrap tempAssembly.fasta") numSeq = 0 for seq_record in SeqIO.parse("tempAssembly.fasta.contigs", "fasta"): startingScaffold = str(seq_record.seq) numSeq += 1 halfSteps.write(">" + str(step) + "\n" + startingScaffold + "\n") if numSeq > 1: print "More than one seq"
# Fragment of a top-level script (the original indentation is not visible here).
# Runs joinScaffolds_careful.py in "join" mode on finalScaffold_1_2000_f.txt and
# finalScaffold_<bestPos1>_<bestPos2>_r.txt, then loads the sequence of
# `genomeToComplete` (the last record wins if the file has several).
# When the expected "joined_..." output file exists, its sequence is read,
# passed through bm.reverseComplement and merged with the genome through
# fuseSequences2; a merged result longer than 10 characters replaces
# `genomeToCompleteSeq` and is written to "newGenome1.fasta".  The trailing
# `else` writes the current `genomeToCompleteSeq` to "newGenome1.fasta" instead.
# NOTE(review): without indentation it cannot be told from here whether that
# `else` belongs to the length check or to the os.path.isfile check.
# NOTE(review): the command line uses str(bestPos1)/str(bestPos2) while the
# file-name expressions concatenate bestPos1/bestPos2 directly; if they are not
# already strings the latter raises TypeError - confirm their type.
os.system( installationDirectory + "/src/conda/bin/python joinScaffolds_careful.py join ../1_cleanReads/qualityFiltered_1.fq ../1_cleanReads/qualityFiltered_2.fq finalScaffold_1_2000_f.txt r finalScaffold_" + str(bestPos1) + "_" + str(bestPos2) + "_r.txt r " + installationDirectory + " 8") for seq_record in SeqIO.parse(genomeToComplete, "fasta"): genomeToCompleteSeq = str(seq_record.seq) if os.path.isfile("joined_finalScaffold_1_2000_f.txt_finalScaffold_" + bestPos1 + "_" + bestPos2 + "_r.txt") == True: print("5' end successfully reconstructed!") for seq_record in SeqIO.parse( "joined_finalScaffold_1_2000_f.txt_finalScaffold_" + bestPos1 + "_" + bestPos2 + "_r.txt", "fasta"): firstPortion = str(seq_record.seq) firstPortion = bm.reverseComplement(firstPortion) firtPortionReconstructed = fuseSequences2(firstPortion, genomeToCompleteSeq) if len(firtPortionReconstructed) > 10: print("firstPortion successuffly joined!") genomeToCompleteSeq = firtPortionReconstructed outfile = open("newGenome1.fasta", "w") outfile.write(">finalScaffold\n" + firtPortionReconstructed + "\n") outfile.close() else: print("firstPortion not joined") outfile = open("newGenome1.fasta", "w") outfile.write(">finalScaffold\n" + genomeToCompleteSeq + "\n") outfile.close()
# Fragment: the end of an elongation loop followed by top-level script code
# (the original indentation is not visible here).
# Loop tail: return `longestScaffold` when it is not longer than `reference`;
# otherwise it becomes the new `reference` and is written to "toElong.fasta".
# Script part: read the sequence to elongate and the sequence to reach from
# their FASTA files (the last record wins if a file has several) and pass each
# through biomodule.reverseComplement when its orientation flag is "r".
# The first 500 characters of the target go to "termini.fasta" and the slice
# startingSeq[-1800:-300] goes to "toElong.fasta" as the elongation seed.
# `sequences` is initialised to an empty dict and not used in this fragment.
if len(longestScaffold) <= len(reference): return longestScaffold else: reference = longestScaffold toElong = open("toElong.fasta", "w") toElong.write(">toElong\n" + reference + "\n") toElong.close() sequences = {} for seq_record in SeqIO.parse(sequenceToElong, "fasta"): startingSeq = str(seq_record.seq) id1 = str(seq_record.id) if sequenceToElongOrientation == "r": startingSeq = biomodule.reverseComplement(startingSeq) for seq_record in SeqIO.parse(sequenceToReach, "fasta"): terminiSeq = str(seq_record.seq) id2 = str(seq_record.id) if sequenceToReachOrientation == "r": terminiSeq = biomodule.reverseComplement(terminiSeq) termfile = open("termini.fasta", "w") termfile.write(">termini\n" + terminiSeq[:500] + "\n") termfile.close() toElong = open("toElong.fasta", "w") toElong.write(">toElong\n" + startingSeq[-1800:-300] + "\n") toElong.close()
# Fragment: the end of a sequence-fusing function followed by top-level script
# code (the function header and the original indentation are not visible here).
# Function tail: the selected BLAST hit is kept in `downstreamAlignment`; when
# one was found the result is s1 up to column 9 of the hit followed by s2 from
# column 7 of the hit onwards, otherwise an empty string is returned.
# NOTE(review): `blastFile` is closed only on the branch that returns a fused
# sequence, not on the branch that returns "".
# Script part: read the start and end sequences from their FASTA files (passed
# through biomodule.reverseComplement when the matching flag is "r"), seed
# `elongedSequence` with startSeq[-700:-200], open
# "joinScaffold_trivialSeq.fasta" for writing and begin a loop that counts
# cycles; when `numCycle` reaches `numCycles` the prefix startSeq[:-700] plus
# `elongedSequence` is written out.  The rest of the loop lies beyond this
# fragment.
downstreamAlignment = fields lastNucl = int(fields[9]) if len(downstreamAlignment) > 0: newSequence = s1[:int(downstreamAlignment[9] )] + s2[int(downstreamAlignment[7]):] blastFile.close() return newSequence else: return "" for seq_record in SeqIO.parse(start, "fasta"): startSeq = str(seq_record.seq) if start_o == "r": startSeq = biomodule.reverseComplement(startSeq) for seq_record in SeqIO.parse(end, "fasta"): terminiSeq = str(seq_record.seq) if end_o == "r": terminiSeq = biomodule.reverseComplement(terminiSeq) elongedSequence = startSeq[-700:-200] outputSeq = open("joinScaffold_trivialSeq.fasta", "w") numCycle = 0 while True: bestElongation = 0 numCycle += 1 if numCycle == numCycles: outputSeq.write(">trivialSeq\n" + startSeq[:-700] + elongedSequence + "\n")
# Fragment of a contig-selection step (Python 2 `print` syntax; the enclosing
# loop/function and the original indentation are not visible here).
# It prints the best scaffold length (the runtime message "Lunghezza migliore
# Scaffold:" is Italian for "Best scaffold length:") and the current overhang.
# Forward check: when the last 15 characters of `reference` occur in `sequence`
# and fusing the two gains more than the current `overhang`, the fused sequence
# becomes `elongedSequence` and `overhang` is updated.
# Reverse check: the same test on biomodule.reverseComplement(sequence).
# When the overhang is below 10, "Poor elongment. Now exit...." is printed and
# "toElong.fasta" is copied to "elonged.fasta".
# NOTE(review): fuseSequences is evaluated twice for every accepted contig
# (once in the condition, once for the assignment).
print "Lunghezza migliore Scaffold:",lengthBestScaffold print "Overhang:",overhang #Check forward contigs if reference[-15:] in sequence and (len(fuseSequences(reference,sequence))-len(reference)) > overhang: elongedSequence = fuseSequences(reference,sequence) overhang = len(elongedSequence)-len(reference) print "Sequence",elongedSequence print "Forward" print "Overhang",len(elongedSequence)-len(reference) #print "Dove si trova la sequenza ",sequence.find(reference[-15:]) #print "Da cercare ",reference[-15:] print seq_record #sys.stdin.read(1) #Check reverse contigs revSequence = biomodule.reverseComplement(sequence) if reference[-15:] in revSequence and (len(fuseSequences(reference,revSequence))-len(reference)) > overhang: elongedSequence = fuseSequences(reference,revSequence) overhang = len(elongedSequence)-len(reference) print "Sequence",elongedSequence print "Reverse" print "Overhang",len(elongedSequence)-len(reference) #print "Dove si trova la sequenza ",sequence.find(reference[-15:]) #print "Da cercare ",reference[-15:] print seq_record #sys.stdin.read(1) if overhang < 10: print "Poor elongment. Now exit...." os.system("cp toElong.fasta elonged.fasta")
def greedyElongation(seq):
    """Iteratively extend ``seq`` with reads taken from ``unmapped.fasta``.

    Each round:

    1. BLAST the unmapped reads against ``toElong.fasta`` and keep the hits
       that reach the last 100 positions of the current reference, span more
       than 40 positions, have identity above 97 and a "0" in column 5 of the
       tabular output.
    2. Assemble those reads with cap3 together with the last 200 nt of the
       reference and take the longest contig.
    3. Orient the contig like the reference (BLAST of the last 100 nt of the
       reference against the contig) and merge both through
       ``fuseSequences2``.
    4. Return the merged sequence as soon as it is not longer than the
       current reference; otherwise it becomes the new reference and is
       written to ``toElong.fasta`` for the next round.

    If the contig gives no BLAST hit against the reference end, a
    ``joined_W_WARNING_*`` file is written and the program exits.

    Relies on names defined elsewhere in the file: ``installationDirectory``,
    ``SeqIO``, ``biomodule``, ``fuseSequences2``, ``sequenceToElong``,
    ``sequenceToReach`` and ``startingSeq``.  ``toElong.fasta`` is expected to
    exist already when the function is called (it is only rewritten at the
    end of each round).

    NOTE(review): the shell commands redirect to a file literally named
    ``null`` in the working directory; ``/dev/null`` was presumably intended.
    The command strings are left untouched here.
    """
    reference = str(seq)

    # First occurrence of a read id wins, as in a plain "if not in" guard.
    unmappedSequences = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        unmappedSequences.setdefault(str(seq_record.id), str(seq_record.seq))

    while True:
        # Blast the unmapped reads against the sequence being elongated.
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in toElong.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query unmapped.fasta -db toElong.fasta -outfmt 6 -num_threads 8 -dust no -soft_masking false -out outputBlast.txt >null 2>&1"
        )

        # Fill the cap3 input: the reference end plus every read aligned
        # close to it.
        with open("toAssemble.fasta", "w") as toAssemble:
            toAssemble.write(">toElong\n" + reference[-200:] + "\n")
            with open("outputBlast.txt") as blastFile:
                for rawLine in blastFile:
                    line = rawLine.rstrip()
                    if not line:
                        break
                    fields = line.split("\t")
                    subjStart = int(fields[8])
                    subjEnd = int(fields[9])
                    tailLimit = len(reference) - 100
                    reachesEnd = subjStart > tailLimit or subjEnd > tailLimit
                    if (reachesEnd and abs(subjEnd - subjStart) > 40
                            and float(fields[2]) > 97 and fields[5] == "0"):
                        readSeq = unmappedSequences[fields[0]]
                        if subjStart > subjEnd:
                            # Hit on the opposite strand: flip the read.
                            readSeq = biomodule.reverseComplement(readSeq)
                        toAssemble.write(">" + fields[0] + "\n" + readSeq + "\n")

        print("Perform second phrap assembly step.......")
        os.system(installationDirectory +
                  "src/conda/bin/cap3 toAssemble.fasta > cap3Assembly 2>null")

        # Among contigs of equal maximal length the last one wins (">=").
        longestScaffold = ""
        for seq_record in SeqIO.parse("toAssemble.fasta.cap.contigs", "fasta"):
            if len(str(seq_record.seq)) >= len(longestScaffold):
                longestScaffold = str(seq_record.seq)

        # Check whether the produced elonged scaffold is on the right orientation
        with open("tempScaffold.fasta", "w") as tempScaffold:
            tempScaffold.write(">tempScaffold\n" + longestScaffold + "\n")
        with open("tempScaffold_query.fasta", "w") as tempScaffold:
            tempScaffold.write(">reference\n" + reference[-100:] + "\n")
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in tempScaffold.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query tempScaffold_query.fasta -db tempScaffold.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out tempScaffold_outputBlast.txt >null 2>&1"
        )
        with open("tempScaffold_outputBlast.txt") as tempScaffold:
            fieldBlast = tempScaffold.readline().rstrip().split("\t")

        if len(fieldBlast) > 2:
            if int(fieldBlast[8]) > int(fieldBlast[9]):
                longestScaffold = biomodule.reverseComplement(longestScaffold)
        else:
            # No hit between the reference end and the new contig: save what
            # has been built so far and stop the whole program.
            print("WARNING! EXTENSION STOPPED!!")
            with open("joined_W_WARNING_" + sequenceToElong + "_" +
                      sequenceToReach, "w") as js:
                js.write(">joined_W_WARNING_" + sequenceToElong + "_" +
                         sequenceToReach + "\n" + startingSeq[:-1800] + reference)
            exit()

        longestScaffold = fuseSequences2(reference, longestScaffold)
        print("Elonged sequence has now a size of", len(longestScaffold),
              "nucleotides")

        if len(longestScaffold) <= len(reference):
            return longestScaffold

        reference = longestScaffold
        with open("toElong.fasta", "w") as toElong:
            toElong.write(">toElong\n" + reference + "\n")
# Fragment of a top-level script (it starts inside an if/else whose condition
# is not visible here; the original indentation is lost).
# The consensus sequence is read either from
# <outputFolder>/<sampleName>/hcmv_genome.fasta_con.fasta or from
# `consensusFile` (the last record wins if the file has several).
# "repeatsFlanking.fasta" is then written with six records: the output of
# bm.reverseComplement on consensus[1364:3000], three slices of the consensus
# at fixed coordinates (192000:193500, 197000:197500, 232000:233500) and two
# hard-coded sequences.  The file is finally moved into
# ./2_ElongationFlankingRepeats/.
# NOTE(review): the header ">TRLflankingEnding" is written twice (second and
# last record), so the FASTA file contains a duplicated id - confirm whether
# the last record was meant to carry a different name.
# NOTE(review): the slice coordinates are fixed, so they presumably assume a
# consensus of a specific length/layout - verify for other inputs.
for seq_record in SeqIO.parse( outputFolder + "/" + sampleName + "/hcmv_genome.fasta_con.fasta", "fasta"): consensusSequence = str(seq_record.seq) else: for seq_record in SeqIO.parse(consensusFile, "fasta"): consensusSequence = str(seq_record.seq) #************************************************************************************************************** #Create file with repeat flanking regions fro consensus sequence (this is needed by pipeline step 2)*********** flankingSequencesFile = open("repeatsFlanking.fasta", "w") flankingSequencesFile.write( ">TRLflankingStarting\n" + bm.reverseComplement(consensusSequence[1364:3000]) + "\n") flankingSequencesFile.write( ">TRLflankingEnding\nCCATTCCGGGCCGTGTGCTGGGTCCCCGAGGGGCGGGGGGGTGTTTTCTGCGGGGGGGTGAAATTTGGAGTTGCGTGTGTGGACGGCGACGGCGACTAGTTGCGTGTGCTGCGGTGGGTACGGCGACGGCGAATAAAAGCGACGTGCGGCGCGCACGGCGAAAAGCAGACGCGCGTCTGTGTCTGTTTGAGTCCCCAGGGGACGGCAGCG\n" ) flankingSequencesFile.write(">IRflankingStarting\n" + consensusSequence[192000:193500] + "\n") flankingSequencesFile.write(">IRflankingEnding\n" + consensusSequence[197000:197500] + "\n") flankingSequencesFile.write(">TRSflankingEnding\n" + consensusSequence[232000:233500] + "\n") flankingSequencesFile.write( ">TRLflankingEnding\nCCCGGCCAACACACCCCGACACACCCGGCACACGCCCGCGACACACCCGGCCAACACACCCCGACACACCCGGCACACGCCCGCGACACACCCGCGGCACACCCTGACACACCCGCCACACCCGGCACACACCCACCCCGCCGCGCCCCCGACACACCCCGACCGCCGCCGGTGCGGGACAGGGCT\n" ) flankingSequencesFile.close() os.system("mv repeatsFlanking.fasta ./2_ElongationFlankingRepeats/") #****************************************************************************************************************
# Fragment of a contig-selection step (Python 2 `print` syntax).  It starts in
# the middle of a print statement; the enclosing loop/function and the original
# indentation are not visible here.
# Forward check: when the first 20 characters of `reference` occur in
# `sequence` and `sequence` is longer than the best scaffold seen so far,
# `elongedSequence` becomes `sequence` from the match position onwards and
# `overhang` is its length minus the length of `reference`.
# Reverse check: the same logic when biomodule.reverseComplement(reference[:20])
# occurs in `sequence`; the match position and the elongated sequence are then
# taken from biomodule.reverseComplement(sequence).
# When the overhang is below 10, "Poor elongment. Now exit...." is printed.
# NOTE(review): the reverse branch calls biomodule.reverseComplement(sequence)
# several times; the result could be computed once.
sequence[startPosition:]), len(reference), str( seq_record.id) if reference[:20] in sequence: if len(sequence) > lengthBestScaffold: lengthBestScaffold = len(sequence) startPosition = sequence.find(reference[:20]) elongedSequence = sequence[startPosition:] bestScaffold = sequence overhang = len(sequence[startPosition:]) - len(reference) print "Overhang composition:", len( sequence[startPosition:]), len(reference), str( seq_record.id) #check reverse contigs if biomodule.reverseComplement(reference[:20]) in sequence: if len(sequence) > lengthBestScaffold: lengthBestScaffold = len(sequence) startPosition = (biomodule.reverseComplement(sequence)).find( reference[:20]) elongedSequence = ( biomodule.reverseComplement(sequence))[startPosition:] bestScaffold = sequence overhang = len((biomodule.reverseComplement(sequence) )[startPosition:]) - len(reference) print "Overhang composition:", len( (biomodule.reverseComplement(sequence) )[startPosition:]), len(reference), str(seq_record.id) if overhang < 10: print "Poor elongment. Now exit...."