Beispiel #1
0
def process_fasta_length(fasta_file, folder, debug):
    """Split a FASTA file into one output file per sequence length.

    The input is consumed two lines at a time (header line, sequence line);
    each pair is handed to ``fasta_functions.process_fasta``, whose result is
    read through its ``'name'`` and ``'sequence'`` keys. A trailing unpaired
    line is ignored.

    Args:
        fasta_file: Path of the FASTA file to read.
        folder: Directory in which the ``seqs_len<N>.fa`` files are written.
        debug: When truthy, print one message per output file.

    Returns:
        dict mapping each sequence length (int) to the path of the file
        holding the records of that length. Keys are inserted in ascending
        length order.
    """
    ## group records by sequence length; input order is kept inside each group.
    ## A plain dict avoids growing a DataFrame row by row and keeps the group
    ## key a scalar int regardless of the installed pandas version.
    records_by_len = {}

    ## read file
    with open(fasta_file, 'r') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) == 2:
                record = fasta_functions.process_fasta(lines)
                # re-init
                lines = []
                records_by_len.setdefault(len(record['sequence']), []).append(
                    (record['name'], record['sequence']))

    len_dict = {}

    ## ascending length order, as a sorted group-by would produce
    for len_int in sorted(records_by_len):
        ## write file
        file_name = os.path.join(folder, 'seqs_len' + str(len_int) + '.fa')

        ## debugging messages
        if debug:
            print("** Printing reads of length (%s) in file %s" %
                  (len_int, file_name))

        ## the with-block closes the handle; no explicit close() is needed
        with open(file_name, 'w') as outfh:
            for name, seq in records_by_len[len_int]:
                outfh.write(name + '\n' + seq + '\n')
        len_dict[len_int] = file_name

    return len_dict
Beispiel #2
0
## Script fragment: reads the fasta given by args.fasta record by record and
## parses each record name into a miRNA name and a list of variant tokens.
## frequencies_miRNA and args are defined outside this fragment
## (presumably a pandas DataFrame and an argparse namespace -- TODO confirm).
col_list = list(frequencies_miRNA)  ## get columns

print("# Selecting variants from file:" + args.fasta)
print("# Printing isomiRs sequences in fasta: " + args.out + '.fasta')

## new df: zero-filled table reusing the index of frequencies_miRNA and col_list
isomiRs_seqs = pd.DataFrame(0, index=frequencies_miRNA.index, columns=col_list)

## read file: the output fasta (args.out + '.fasta') is held open for writing
## while the input fasta is scanned
with open(args.out + '.fasta', 'w') as outfh:
    with open(args.fasta, 'r') as fh:
        lines = []
        for line in fh:
            ## records are consumed as pairs of lines (header, sequence)
            lines.append(line.rstrip())
            if len(lines) == 2:
                record = fasta_functions.process_fasta(lines)
                # re-init
                lines = []

                ## discard names in which '::' is directly followed by '>':
                # e.g. >hsa-mir-518f-5p::>hsa-mir-520c-5p|>hsa-mir-526a-5p|>hsa-mir-518d-5p|TS-7511
                # e.g. >hsa-mir-548h-3p::>hsa-mir-548z|TS-5966
                if (re.search('.*::>.*', record['name'])):
                    continue

                ## parse the others: text before '::' is the miRNA name (with
                ## '>' removed), text after it is split on '-' into variant tokens
                list_split = record['name'].split('::')

                #print (list_split)
                miRNA = list_split[0].replace('>', '')
                variant_list = list_split[1].split('-')
Beispiel #3
0
def discard_revcomp(outfile_path, reads):
    """Filter a simulated FASTQ file down to reads seen on the '+' strand.

    First the ``.aln`` file belonging to ``outfile_path`` is scanned: for
    every ``>`` header whose fourth tab-separated field is ``'+'``, the first
    following non-header line is taken as a sequence and remembered. Then the
    FASTQ file is read four lines at a time and every record whose sequence
    was remembered is copied to the filtered output file.

    Args:
        outfile_path: Common path prefix of the simulation files. With
            ``reads == 'PE'`` the files ``<prefix>1.aln`` and ``<prefix>1.fq``
            are read and ``<prefix>_filter_R1.fq`` is written; otherwise
            ``<prefix>.aln``, ``<prefix>.fq`` and ``<prefix>_filter.fq``.
        reads: ``'PE'`` selects the paired-end file names; any other value
            selects the single-end names.

    Returns:
        defaultdict(int) mapping the name of each written record, with every
        ``'/1'`` replaced by ``'/2'``, to the number of times it was written.

    Raises:
        IndexError: If a ``>`` header line has fewer than four tab-separated
            fields.
    """
    ##### Remove non 5'-3' simulated reads
    ## use art illumina aln file generated for R1
    if reads == 'PE':
        aln_file_R1 = outfile_path + '1.aln'
        fastq_file = outfile_path + '1.fq'
        out_file = outfile_path + '_filter_R1.fq'
    else:
        aln_file_R1 = outfile_path + '.aln'
        fastq_file = outfile_path + '.fq'
        out_file = outfile_path + '_filter.fq'

    ## read aln file: collect the sequences that follow a '+' header
    forward_seqs = set()
    with open(aln_file_R1, 'r') as fh:
        ## ID of the last '+' header still waiting for its sequence line
        pending_id = None
        for line in fh:
            if line.startswith(('#', '@')):
                continue
            if line.startswith('>'):
                line_list = line.rstrip().split('\t')
                ## every header resets the state, so an ID left over from a
                ## previous header can never be paired with the wrong line
                ## or be mistaken for a sequence
                if line_list[3] == '+':
                    pending_id = line_list[1][:-2]  # drop last two characters
                else:
                    pending_id = None
                continue
            if pending_id is None:
                ## line belongs to a discarded record, or is the second
                ## line of a record that was already consumed
                continue

            record = fasta_functions.process_fasta([pending_id, line.rstrip()])
            pending_id = None
            forward_seqs.add(record['sequence'])

    ## print in file: copy the fastq records whose sequence was kept above
    fastq_dict = defaultdict(int)
    with open(out_file, 'w') as out_fh, open(fastq_file, 'r') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) < 4:
                continue

            record = fasta_functions.process_fastq(lines)
            lines = []

            if record['sequence'] in forward_seqs:
                out_fh.write("%s\n%s\n%s\n%s\n" %
                             (record['name'], record['sequence'],
                              record['optional'], record['quality']))
                ## key under the mate's name ('/1' -> '/2')
                fastq_dict[record['name'].replace('/1', '/2')] += 1

    return fastq_dict