def process_fasta_length(fasta_file, folder, debug):
    """
    Split a FASTA file into one output file per sequence length.

    The input is consumed two lines at a time (header line, sequence line);
    each pair is parsed with fasta_functions.process_fasta(). A trailing
    unpaired line is ignored.

    :param fasta_file: Path to the input FASTA file (two lines per record).
    :param folder: Directory where the per-length files are written.
    :param debug: If true, print a message for every file written.

    :returns: Dictionary mapping each sequence length (int) to the path of
              the file 'seqs_len<length>.fa' holding the records of that
              length. Empty if the input contains no complete record.
    """
    ## group records by sequence length
    ## A plain dict is used instead of a DataFrame + groupby(['len']):
    ## grouping by a one-element list makes recent pandas versions yield
    ## 1-tuples as group keys (file names like 'seqs_len(21,).fa'), and
    ## appending rows one at a time with .loc is quadratic.
    records_by_len = {}
    with open(fasta_file, 'r') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) == 2:
                record = fasta_functions.process_fasta(lines)

                # re-init
                lines = []

                records_by_len.setdefault(len(record['sequence']), []).append(
                    (record['name'], record['sequence']))

    ## write one file per length; lengths in ascending order, records in
    ## the order they appeared in the input
    len_dict = {}
    for len_int in sorted(records_by_len):
        file_name = os.path.join(folder, 'seqs_len' + str(len_int) + '.fa')

        ## debugging messages
        if debug:
            print("** Printing reads of length (%s) in file %s" % (len_int, file_name))

        with open(file_name, 'w') as outfh:
            outfh.writelines(
                name + '\n' + seq + '\n'
                for name, seq in records_by_len[len_int])

        len_dict[len_int] = file_name

    return len_dict
col_list = list(frequencies_miRNA) ## get columns print("# Selecting variants from file:" + args.fasta) print("# Printing isomiRs sequences in fasta: " + args.out + '.fasta') ## new df isomiRs_seqs = pd.DataFrame(0, index=frequencies_miRNA.index, columns=col_list) ## read file with open(args.out + '.fasta', 'w') as outfh: with open(args.fasta, 'r') as fh: lines = [] for line in fh: lines.append(line.rstrip()) if len(lines) == 2: record = fasta_functions.process_fasta(lines) # re-init lines = [] ## discard: # e.g. >hsa-mir-518f-5p::>hsa-mir-520c-5p|>hsa-mir-526a-5p|>hsa-mir-518d-5p|TS-7511 # e.g. >hsa-mir-548h-3p::>hsa-mir-548z|TS-5966 if (re.search('.*::>.*', record['name'])): continue ## parse the others list_split = record['name'].split('::') #print (list_split) miRNA = list_split[0].replace('>', '') variant_list = list_split[1].split('-')
def discard_revcomp(outfile_path, reads):
    """
    Remove non 5'-3' simulated reads from the R1 fastq file.

    Sequences are collected from the art illumina alignment file generated
    for R1, keeping only entries whose header reports strand '+'. The R1
    fastq file is then copied to a filtered file, retaining only the reads
    whose sequence is among the collected ones.

    :param outfile_path: Common prefix of the simulation files. Input files
                         are '<prefix>1.aln' / '<prefix>1.fq' when reads is
                         'PE', otherwise '<prefix>.aln' / '<prefix>.fq'.
    :param reads: 'PE' selects the paired-end file names; any other value
                  selects the single-end names.

    :returns: defaultdict(int) counting, for every retained read, its name
              with '/1' replaced by '/2'.
    """
    paired = (reads == 'PE')

    ## alignment file generated for R1
    aln_path = outfile_path + ('1.aln' if paired else '.aln')

    ## collect sequences reported on the '+' strand
    plus_strand_seqs = set()
    pending = []
    with open(aln_path, 'r') as aln_fh:
        for aln_line in aln_fh:
            ## skip '#' and '@' lines
            if aln_line.startswith(('#', '@')):
                continue

            ## header line: tab separated, 4th field is the strand
            if aln_line.startswith('>'):
                fields = aln_line.rstrip().split('\t')
                if fields[3] == '+':
                    ## 2nd field is the ID; its last two characters are dropped
                    pending.append(fields[1][:-2])
                continue

            ## first line after a '+' header is taken as the sequence
            if len(pending) == 1:
                pending.append(aln_line.rstrip())

            if len(pending) == 2:
                record = fasta_functions.process_fasta(pending)
                pending = []
                plus_strand_seqs.add(record['sequence'])

    ## fastq input and filtered output
    if paired:
        fastq_path = outfile_path + '1.fq'
        filtered_path = outfile_path + '_filter_R1.fq'
    else:
        fastq_path = outfile_path + '.fq'
        filtered_path = outfile_path + '_filter.fq'

    ## copy the matching reads, four lines per fastq record
    kept_ids = defaultdict(int)
    with open(filtered_path, 'w') as out_fh, open(fastq_path, 'r') as fq_fh:
        chunk = []
        for fq_line in fq_fh:
            chunk.append(fq_line.rstrip())
            if len(chunk) < 4:
                continue

            record = fasta_functions.process_fastq(chunk)
            chunk = []

            mate_id = record['name'].replace('/1', '/2')
            if record['sequence'] not in plus_strand_seqs:
                continue

            out_fh.write("%s\n%s\n%s\n%s\n" % (record['name'],
                                               record['sequence'],
                                               record['optional'],
                                               record['quality']))
            kept_ids[mate_id] += 1

    return kept_ids