def by_file(inFile, uargs):
    try:
        fasta = pyfasta.Fasta(inFile)
    except pyfasta.fasta.DuplicateHeaderException:
        # Duplicate headers: rewrite to a temp file with unique names and retry
        tmpFile = rename_fasta(inFile)
        fasta = pyfasta.Fasta(tmpFile.name)
        tmpFile.close()
    except (ValueError, TypeError):
        msg = 'ERROR: Could not read file: {}'
        sys.stderr.write(msg.format(inFile) + '\n')
        return None
def yield_sequence_and_BlastHit(len_seq=500, n_query=10, min_len_hit=20, max_len_hit=40):
    # Generate a random subject sequence
    with rand_fasta(len_seq=len_seq, n_seq=1) as subject:
        pyfasta_gen = pyfasta.Fasta(subject.fasta_path, flatten_inplace=True)
        seq_record = pyfasta_gen["seq_0"]
        sequence = Sequence(name="seq_0", seq_record=seq_record)
        seq_dict = {}
        # Generate random hits from the subject
        for i in range(n_query):
            start = ri(1, len_seq - max_len_hit)
            end = start + ri(min_len_hit, max_len_hit)
            seq_dict["query_{}:{}-{}".format(i, start, end)] = str(seq_record[start:end])
        with defined_fasta(seq_dict) as query:
            # Create a BLAST DB and perform a blast to generate a list of hits
            with Blastn(subject.fasta_path) as blastn:
                hit_list = blastn(query.fasta_path, task="blastn", best_query_hit=True)
                yield (sequence, hit_list)
def loadgenome_extradata_fx(fasta_handle, gff3_handle, meta):
    """Load a genome FASTA, a GFF3 feature table and a metadata CSV."""
    genome = pyfasta.Fasta(fasta_handle, key_fn=lambda key: key.split()[0])
    gff3 = allel.FeatureTable.from_gff3(gff3_handle)
    meta = pd.read_csv(meta, delimiter=",")
    return (genome, gff3, meta)
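# Hedged usage sketch for the key_fn idiom above; the file names and contig
# name are hypothetical. pyfasta applies key_fn to every header, so a record
# like ">2L Anopheles gambiae AgamP4" is indexed simply as "2L" and can be
# looked up without the trailing description.
genome, gff3, meta = loadgenome_extradata_fx("genome.fa", "features.gff3", "samples.csv")
first_kb = genome["2L"][0:1000]  # lazy, 0-indexed slice of the first 1 kb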
def _contig_size_list(self, path):
    """Insert all contig sizes in a list."""
    f = pyfasta.Fasta(path)
    self.contig_sizes = [len(f[key]) for key in f.keys()]
    self.contig_sizes.sort()
def main(argv):
    parser = argparse.ArgumentParser(
        description='Interpret FASTA file with Stanford service.')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-i', '--input', dest='input_file',
                               help='Input file name', required=True)
    requiredNamed.add_argument('-q', '--query', dest='graphQL_query_file',
                               help='GraphQL query file', required=True)
    results = parser.parse_args(argv)
    input_file = results.input_file
    graphQL_query_file = results.graphQL_query_file

    print generateCSVHeader()

    f = pyfasta.Fasta(input_file)
    nrLoops = int(len(f.keys()) / 1000)

    # Per 1000 sequences, do a Stanford analysis
    for i in range(0, nrLoops):
        headers = list(f.keys()[j] for j in range(i * 1000, (i * 1000) + 1000))
        pool = mp.Pool(processes=20)
        results = [pool.apply_async(doStanfordAnalysis,
                                    args=(header, f[header], graphQL_query_file))
                   for header in headers]
        for p in results:
            print p.get()

    # Do the Stanford analysis for the last sequences available
    headers = list(f.keys()[j] for j in range(nrLoops * 1000,
                                              nrLoops * 1000 + (len(f.keys()) % 1000)))
    pool = mp.Pool(processes=20)
    results = [pool.apply_async(doStanfordAnalysis,
                                args=(header, f[header], graphQL_query_file))
               for header in headers]
    for p in results:
        print p.get()
def explainFL(genomeFile, outPrefix, sam):
    genome = pyfasta.Fasta(genomeFile)
    samfile = pysam.AlignmentFile(sam, "r")
    fout = open(outPrefix + 'explainFL.txt', 'w')
    fout_nop = open(outPrefix + 'explainFL_noprimary.txt', 'w')
    for read in samfile.fetch():
        # if read.mapping_quality < 20:
        #     continue
        if len(read.cigar) == 0:
            continue
        readInfo = getReadInfo(read)
        leftSeq = genome.sequence({'chr': readInfo[1],
                                   'start': readInfo[2] - 1,
                                   'stop': readInfo[2]}).upper()
        rightSeq = genome.sequence({'chr': readInfo[1],
                                    'start': readInfo[3] + 1,
                                    'stop': readInfo[3] + 2}).upper()
        readInfo.append(leftSeq)
        readInfo.append(rightSeq)
        if read.flag & 256 > 0:
            fout_nop.write('\t'.join([str(i) for i in readInfo]) + "\n")
        else:
            fout.write('\t'.join([str(i) for i in readInfo]) + "\n")
    samfile.close()
    fout.close()
    fout_nop.close()
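# A minimal sketch of pyfasta's dict-style sequence() call used above; the
# file and contig names are hypothetical. Coordinates are 1-based by default,
# and passing 'strand': '-' returns the reverse complement.
import pyfasta
genome = pyfasta.Fasta("genome.fa")
flank = genome.sequence({'chr': 'chr1', 'start': 100, 'stop': 102}).upper()
rc = genome.sequence({'chr': 'chr1', 'start': 100, 'stop': 102, 'strand': '-'})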
def generate_index(self):
    print("Generating samtools index...")
    sys.stdout.flush()
    try:
        subprocess.check_call(['samtools', 'faidx', self.fasta_out])
    except subprocess.CalledProcessError:
        sys.exit("Invalid genome fasta input, please check the source.")
    print("done\n")

    # For custom references, validate that fasta contig names match the
    # definitions in contig_defs: PRIMARY_CONTIGS must be a subset of the
    # contig names in genome.fa
    if self.contig_defs['reference_name'] not in STANDARD_GENOMES:
        regtools.validate_contig_names(self.fasta_out + '.fai', self.contig_defs)

    print("Generating pyfasta indexes...")
    sys.stdout.flush()
    pyf = pyfasta.Fasta(self.fasta_out, key_fn=lambda x: x.split()[0])
    contigs = len(pyf)
    size = sum(len(pyf[contig]) for contig in pyf)
    print("    Number of contigs: %d\n    Total genome size: %d" % (contigs, size))
    print("done\n")
def get_dn_ds_from_fasta(input_fasta, output_prefix):
    try:
        os.mkdir(output_prefix)
    except OSError:
        pass
    fasta_in = pyfasta.Fasta(input_fasta)
    genes = list(fasta_in.keys())
    output_dn_ds = OrderedDict()

    # Derive the output file name from the input fasta name
    if os.path.basename(input_fasta).startswith("N"):
        if "permissive" in input_fasta:
            output_file = os.path.join(output_prefix,
                                       os.path.basename(input_fasta).split(".permissive.fasta")[0] + ".permissive.dn_ds")
        else:
            output_file = os.path.join(output_prefix,
                                       os.path.basename(input_fasta).split(".strict.fasta")[0] + ".strict.dn_ds")
    else:
        output_file = os.path.join(output_prefix,
                                   os.path.basename(input_fasta).split(".fasta")[0] + ".dn_ds")

    if os.path.exists(output_file):
        # Resume: find the last gene already written
        with open(output_file) as out_f:
            for line in out_f:
                line_s = line.split("\t")
                last_gene = line_s[0]
        idx = genes.index(last_gene)
    else:
        # Do the whole thing
        idx = 0

    with open(output_file, "w") as out_f:
        for gene in genes:
            out_ds = get_dn_ds_from_alignment(input_fasta, these_samples=[gene], do_window=True,
                                              gene_name=gene, cbs_reference=False, window=200,
                                              step=10, hoffman=True)
            if out_ds is not None:
                rows = out_ds
                out_f.write(str(gene) + "\tOVERALL\t" + str(rows[0][0]) + "\t" + str(rows[0][1]) + "\n")
                for row in rows[1][gene]:
                    out_f.write(str(gene) + "\tWINDOW\t" + str(row[0]) + "\t" + str(row[1]) + "\n")
def test_Interval_sequence():
    genome = pyfasta.Fasta('test/example.fa')
    l1 = Interval.from_string('1:858-967:1', genome=genome)
    l2 = Interval.from_string('1:858-967:-1', genome=genome)
    print l1.sequence
    print l2.sequence
    assert l1.sequence != l2.sequence
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    fp = open(refbed)
    for i, row in enumerate(fp):
        s = STRLine(row)
        seq = genome[s.seqid][s.start - 1:s.end].upper()
        s.motif = get_motif(seq, len(s.motif))
        s.fix_counts(seq)
        if opts.checkvalid and not s.is_valid():
            continue
        edits.append(s)
        if i % 10000 == 0:
            print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(str(e))
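# Worked example of the slice arithmetic above: s.start/s.end are 1-based
# inclusive, while pyfasta slices are 0-based half-open, so bases start..end
# map to genome[seqid][start - 1:end]. A quick check with a plain string:
contig = "ACGTACGTAC"
start, end = 2, 5                       # 1-based inclusive: bases 2 through 5
assert contig[start - 1:end] == "CGTA"  # 4 bases, as expected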
def _contig_size_dict(self, path):
    """Find the distribution of contig sizes."""
    f = pyfasta.Fasta(path)
    self.contig_size_dict = {}
    for key in f.keys():
        self.contig_size_dict[key] = len(f[key])
    return self.contig_size_dict
def count_bases_in_peaks(reference_path, peaks_file):
    """Count the total number of bases in peak regions (0-indexed)"""
    bases_in_peaks = 0
    ctg_mgr = ReferenceManager(reference_path)
    genome_fa = pyfasta.Fasta(ctg_mgr.fasta, key_fn=lambda x: x.split()[0])
    for peak in peak_reader(peaks_file):
        bases_in_peaks += len(genome_fa[peak.chrom][peak.start:peak.end])
    return bases_in_peaks
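# Hedged usage sketch for count_bases_in_peaks; the paths are hypothetical and
# ReferenceManager/peak_reader come from the surrounding package. Since peaks
# are 0-indexed half-open intervals, each peak contributes end - start bases.
n_bases = count_bases_in_peaks("/refdata/GRCh38", "peaks.bed")
print("bases in peaks: %d" % n_bases)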
def get_barcode_gc(ref_f, peaks_f, matrix):
    """Get mean GC% of peaks in a barcode"""
    ref_mgr = ReferenceManager(ref_f)
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    peak_GC = np.array([get_peak_GC_counts(peak, genome_fa, counts=False)
                        for peak in peak_reader(peaks_f)])
    barcode_GC = ((peak_GC * matrix.m) / np.array(matrix.m.sum(axis=0))).squeeze()
    return barcode_GC
def __init__(self, fasta, motifs_input, bg=None):
    self.all_motifs = []
    with open(motifs_input, "r") as infile:
        self.all_motifs = list(motifs.parse(infile, "jaspar"))

    # for large sequence headers, only keep the text before the first space
    self.genome_seq = pyfasta.Fasta(fasta, key_fn=lambda x: x.split()[0])
    self.bg = bg
def _get_NrContigs(self, path):
    """Find the number of contigs in the fasta file."""
    try:
        f = pyfasta.Fasta(path)
        self.nrContigs = len(f)
        print("Contigs: " + str(self.nrContigs))
    except ValueError:
        self.nrContigs = 0
def _get_NrContigs(self, path):
    """Find the number of contigs in the fasta file."""
    f = pyfasta.Fasta(path)
    counter = 0
    for header in f:
        counter += 1
    self.nrContigs = counter
    print("Contigs: " + str(self.nrContigs))
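# Both variants above should agree: iterating a pyfasta.Fasta yields its
# headers, so the loop count equals len(f) from the previous method. A quick
# hedged check with a hypothetical path:
import pyfasta
f = pyfasta.Fasta("contigs.fa")
assert sum(1 for _ in f) == len(f)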
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(
                        t, bam_path, fasta_sequences, hints_args.genome)

            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences,
                                                                hints_args.genome))

            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta),
                                                      hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))
            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contains only upper case (use
    fasta.format --upper to convert from UCSC fasta). The bed file is
    generated by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        for row in fp:
            s = STRLine(row)
            total += 1
            for ns in s.iter_exact_str(genome):
                if not ns.is_valid():
                    continue
                print >> newbed, ns
                retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def __init__(self, gtf_file, fasta_file):
    self.gtf_file = gtf_file

    def map_key(key):
        return key.split(' ')[0]

    self.fa = pyfasta.Fasta(fasta_file)
    self.fasta_file = fasta_file
    # map short header names (before the first space) to the full headers
    self.mapkeys = dict()
    for k in self.fa.keys():
        self.mapkeys[map_key(k)] = k
def __init__(self, name, fasta, compress=True):
    """
    Create a reference object, extract the fasta ref if needed and create a
    Sequence object per sequence found in the fasta file
    @param name Name of the Reference
    @param fasta Path to a fasta file (can be gzipped)
    @param compress Fasta output will be gzipped if True
    """
    print(("Create {} object".format(name)))
    # Create self variables
    self.name = name
    self.temp_dir = mkdtemp()
    self.compress = compress

    # Create a name for the fasta file to be generated
    self.modified_fasta = "{}_masked.fa{}".format(self.name, ".gz" if self.compress else "")

    try:
        # Test values
        assert self.name not in self.REFERENCE_NAMES, "Reference name <{}> is duplicated".format(self.name)
        assert is_readable_file(fasta), "{} is not a valid file".format(fasta)

        # If gzipped, ungzip the reference fasta file in the temporary folder.
        # If not compressed, copy it into the temporary folder.
        if is_gziped(fasta):
            print("  * Unzip fasta file in a temporary directory")
            self.fasta = gunzip(fasta, self.temp_dir)
        else:
            print("  * Copy fasta file in a temporary directory")
            self.fasta = cp(fasta, self.temp_dir)

        # Load the fasta sequences in a pyfasta.Fasta (seq_record is a mapping)
        print("  * Parsing the file with pyfasta")
        seq_dict = {}
        fasta_record = pyfasta.Fasta(self.fasta, flatten_inplace=True)
        print(("  * Found {} sequences in {}".format(len(fasta_record), self.name)))
        for name, seq_record in list(fasta_record.items()):
            # Remove the additional sequence descriptor in the fasta header and create a Sequence object
            short_name = name.partition(" ")[0]
            assert short_name not in seq_dict, "Reference name <{}> is duplicated in <{}>".format(short_name, self.name)
            seq_dict[short_name] = Sequence(name=short_name, seq_record=seq_record)

        # Save to a name-sorted ordered dict
        self.seq_dict = OrderedDict(sorted(list(seq_dict.items()), key=lambda x: x))

        # Add name to a class list
        self.ADD_TO_REFERENCE_NAMES(self.name)

    except Exception as E:
        self.clean()
        raise E
def __init__(self, ref_path, bg=None):
    ref_manager = ReferenceManager(ref_path)
    self.all_motifs = []
    if ref_manager.motifs is not None:
        with open(ref_manager.motifs, "r") as infile:
            self.all_motifs = list(motifs.parse(infile, "jaspar"))

    # for large sequence headers, only keep the text before the first space
    self.genome_seq = pyfasta.Fasta(ref_manager.fasta, key_fn=lambda x: x.split()[0])
    self.bg = bg
def open_reference(reference_path):
    '''Open a reference fasta and rename the contigs to strip any fasta comments'''
    fasta = pyfasta.Fasta(get_fasta(reference_path))
    new_fasta = {}
    for (k, v) in fasta.iteritems():
        key_prefix = k.split(" ")[0]
        new_fasta[key_prefix] = v
    return new_fasta
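# Minimal usage sketch for the renamed reference above; the path is
# hypothetical. The dict values are still lazy pyfasta records, so slicing
# stays memory-cheap even though the keys have had header comments stripped.
ref = open_reference("/path/to/reference")
print(ref["chr1"][0:50])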
def circSeq(genomeFile, outPrefix, thread):
    global genome, FL
    genome = pyfasta.Fasta(genomeFile)
    FL = pd.read_csv(outPrefix + 'constructFL_Normal_adj.txt', sep='\t',
                     dtype={'exon_start': str, 'exon_end': str})
    fout = open(outPrefix + 'circSeq.fa', 'w')
    pool = Pool(processes=thread)
    seq = pool.map(getSeq, range(FL.shape[0]))
    pool.close()
    pool.join()
    for i in range(len(seq)):
        fout.write('>' + seq[i][0] + '\n' + seq[i][1] + '\n')
    fout.close()
    tidehunter(outPrefix + 'circSeq.fa', outPrefix + 'circSeq.th', thread)
def main(opts):
    """
    Main function.
    :param opts: input parameters
    :return: file containing flanking sequences with mutations as IUPAC
        nucleotides, and a vcf file with the selected sequences.
    """
    # input files
    vcf_reader = vcf.Reader(filename=opts.vcf_in)  # read in raw vcf
    reference = pyfasta.Fasta(opts.reference)      # read in reference genome
    with open(opts.snp_set, 'r') as f:             # open selected markers, but only the first line
        first_line = f.readline()

    # output files
    primer_seq = open(opts.fasta_out, "w")  # output IUPAC nucleotide file
    # output vcf with only the selected sequences from the raw vcf file (contains all the info)
    writer_willem = vcf.Writer(open(opts.vcf_out, 'w'), vcf_reader, lineterminator='\n')

    # For each of the selected snps
    snp_index = first_line.split()
    for snp in snp_index:
        coordinate = snp.rsplit(".", 1)
        scaffold_len = len(str(reference[coordinate[0]]))
        start = int(coordinate[1]) - opts.length
        stop = int(coordinate[1]) + opts.length
        # if the reference sequence is not long enough, adjust start and stop
        if start < 0:
            start = 0
        if stop > scaffold_len:
            stop = scaffold_len
        snp_seq = reference[coordinate[0]][start:stop]
        snp_locs, snp_call, main_snp = get_all_snps(vcf_reader, coordinate, start, stop,
                                                    writer_willem, opts.maf, opts.call_rate)
        new_seq = parse_sequence(coordinate[1], snp_locs, snp_seq, snp_call, main_snp, start)
        primer_seq.write(coordinate[0] + ":" + str(start) + "-" + str(stop) + ":" + coordinate[1] + "\t")
        primer_seq.write(new_seq + "\n")

    primer_seq.close()
    writer_willem.close()
def main():
    parser = argparse.ArgumentParser(description="Extract informative sites")
    parser.add_argument("aligned_fasta")
    parser.add_argument("-v", "--vcf", dest="vcf", help="Extract genotypes from VCF")
    args = parser.parse_args()

    fasta_r = pyfasta.Fasta(args.aligned_fasta)
    indels = None
    if args.vcf is not None:
        # We should generate a list of indels for each sample
        indels = get_indels_from_vcf(args.vcf)
    extract_informative(fasta_r, indels)
def pyfasta_fasta(n):
    print('timings for pyfasta.Fasta')
    ti = []
    tf = []
    for _ in range(n):
        t = time.time()
        f = pyfasta.Fasta(fa_file.name)
        ti.append(time.time() - t)

        t = time.time()
        read_dict(f, headers)
        tf.append(time.time() - t)
        os.remove(fa_file.name + '.flat')
        os.remove(fa_file.name + '.gdx')

    # profile memory usage and report timings
    tracemalloc.start()
    f = pyfasta.Fasta(fa_file.name)
    read_dict(f, headers)
    os.remove(fa_file.name + '.flat')
    os.remove(fa_file.name + '.gdx')
    print(tracemalloc.get_traced_memory())
    print(mean(ti))
    print(mean(tf) / nreads / 10 * 1000 * 1000)
    tracemalloc.stop()
def get_generator(self, loop_infinitely):
    # read bed_source into memory
    bed_fh = fp.get_file_handle(self.bed_source)
    data = []
    print("Reading bed file " + self.bed_source + " into memory")
    for a_row in bed_fh:
        a_row = a_row.rstrip().split("\t")
        data.append(Interval(chrom=a_row[0],
                             start=int(a_row[1]),
                             stop=int(a_row[2]),
                             labels=[self.labels_dtype(x) for x in a_row[3:]]))
    print("Finished reading bed file into memory; got " + str(len(data)) + " rows")

    if self.num_to_load_for_eval > len(data):
        print("num_to_load_for_eval is " + str(self.num_to_load_for_eval)
              + " but length of data is " + str(len(data)) + "; adjusting")
        self.num_to_load_for_eval = len(data)

    random_obj = np.random.RandomState(self.random_seed)
    if self.randomize_after_pass:
        data = shuffle_array(arr=data, random_obj=random_obj)

    # fasta extraction
    import pyfasta
    f = pyfasta.Fasta(self.fasta_data_source)

    idx = 0
    while idx < len(data):
        to_extract = data[idx:idx + 1]
        if idx % 1000 == 0:
            print(to_extract)
        to_yield = f[to_extract[0].chrom][to_extract[0].start:to_extract[0].stop]
        to_yield = np.array([one_hot_encode[x] for x in to_yield])
        yield (to_yield, to_extract[0].labels,
               (to_extract[0].chrom, to_extract[0].start, to_extract[0].stop))
        idx += 1
        if idx == len(data):
            if loop_infinitely:
                if self.randomize_after_pass:
                    data = shuffle_array(arr=data, random_obj=random_obj)
                idx = 0
            else:
                # PEP 479: return to end the generator; raising StopIteration
                # inside a generator becomes RuntimeError on Python 3.7+
                return
def __init__(self, fasta_path, gff3_path, seqid=None):
    """
    An annotated reference genome.

    Parameters
    ----------
    fasta_path : string
        Path to reference genome FASTA file.
    gff3_path : string
        Path to genome annotations GFF3 file.
    seqid : string, optional
        If given, limit the annotations to a single chromosome.

    """
    # store initialisation parameters
    self._fasta_path = fasta_path
    self._gff3_path = gff3_path
    self._seqid = seqid

    # setup access to reference sequence
    self._fasta = pyfasta.Fasta(fasta_path)

    # setup access to GFF3 as a table
    if isinstance(gff3_path, (list, tuple)):
        tbl_features = etl.cat(*[etl.fromgff3(p) for p in gff3_path])
    else:
        tbl_features = etl.fromgff3(gff3_path)
    tbl_features = (tbl_features
                    .unpackdict('attributes', ['ID', 'Parent'])
                    .rename({'ID': 'feature_id', 'Parent': 'parent_id', 'end': 'stop'})
                    .select(lambda row: (row.stop - row.start) > 0))

    # limit data to a single chromosome
    if seqid is not None:
        tbl_features = tbl_features.eq('seqid', seqid)
    self._tbl_features = tbl_features.cache()

    # index features by ID
    self._idx_feature_id = self._tbl_features.recordlookupone('feature_id')

    # index features by parent ID
    self._idx_parent_id = self._tbl_features.recordlookup('parent_id')

    # index features by genomic location
    self._idx_location = self._tbl_features.facetintervalrecordlookup(
        'seqid', 'start', 'stop', include_stop=True)
def main():
    try:
        strtablefile = sys.argv[1]
        if strtablefile == "-":
            strtablefile = "/dev/stdin"
        genomeFile = sys.argv[2]
        if genomeFile == "-":
            genomeFile = "/dev/stdin"
    except IndexError:
        print __doc__
        sys.exit(1)

    sys.stderr.write("loading genome...\n")
    genome = pyfasta.Fasta(genomeFile)
    refkeys = dict([(key.split()[0], key) for key in genome.keys()])

    sys.stderr.write("processing each locus...\n")
    print "\t".join(["chrom", "start", "end", "score", "GC", "entropy"])
    ProcessEachLocus(strtablefile, genome, refkeys)
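# The refkeys dict above maps short contig names to full headers by hand; a
# roughly equivalent alternative is pyfasta's key_fn hook, used by several
# other snippets in this collection, which renames the keys at load time:
#     genome = pyfasta.Fasta(genomeFile, key_fn=lambda key: key.split()[0])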
def adjExplainNormal(genomeFile, outPrefix, thread, isSecond=False):
    global genome, FLdf_2, targetID_dict
    genome = pyfasta.Fasta(genomeFile)
    FLdf = pd.read_csv(outPrefix + "explainFL_Normal.txt", sep='\t')
    FLdf = FLdf.sort_values(by=["ID", "query_start"], ascending=True)
    FLdf_counts = FLdf['ID'].value_counts()
    FLdf.index = FLdf.ID
    FLdf_2 = FLdf.loc[FLdf_counts.index[FLdf_counts.values == 2], :].copy()
    FLdf_no2 = FLdf.loc[FLdf_counts.index[FLdf_counts.values != 2], :].copy()
    targetID = list(set(FLdf_2.index))
    targetID_dict = {}
    for i in range(FLdf_2.shape[0]):
        if FLdf_2.index[i] in targetID_dict:
            targetID_dict[FLdf_2.index[i]].append(i)
        else:
            targetID_dict[FLdf_2.index[i]] = [i]

    pool = Pool(processes=thread)
    result = pool.map(getNewFL, targetID)
    pool.close()
    pool.join()

    newFL_2 = []
    for i in result:
        for j in i:
            newFL_2.append(j)
    newFL_2 = pd.DataFrame(np.array(newFL_2).reshape(-1, 12))
    newFL_2.columns = FLdf_no2.columns
    result = pd.concat([newFL_2, FLdf_no2])
    result.to_csv(outPrefix + "explainFL_Normal_adj.txt", sep="\t", header=True, index=False)

    if isSecond:
        circOrigin = result.loc[:, ['ID', 'strand']].copy()
        strandScore = []
        for i in circOrigin.strand:
            if i == '+':
                strandScore.append(1)
            elif i == '-':
                strandScore.append(-1)
            else:
                strandScore.append(0)
        circOrigin.strand = strandScore
        circOrigin.columns = ['ID', 'score']
        circOrigin = circOrigin.drop_duplicates('ID')
        return circOrigin