def count_barcodes(dataset, VERBOSE=0):
    '''Count the abundance of each barcode and plot the rank-abundance curve.

    Parameters:
        dataset: run description, passed unchanged to get_raw_read_files().
        VERBOSE: verbosity level; accepted but not used by this function.

    The barcode reads are taken from the 'adapter' entry of the mapping
    returned by get_raw_read_files() and are parsed as FASTQ. Counting stops
    as soon as `maxreads` reads have been processed.

    NOTE: `maxreads` is not a parameter; it is resolved as a module-level
    name when the function runs, so it must be defined before calling.

    Returns None. The 20 most abundant (barcode, count) pairs are printed and
    a log-log plot of abundance versus rank is shown.
    '''

    # Get the file holding the barcode (adapter) reads
    datafile = get_raw_read_files(dataset)['adapter']

    # Count the abundance of each barcode
    bc_counts = defaultdict(int)
    with open(datafile, 'r') as infile:
        for n_reads, read in enumerate(SeqIO.parse(infile, 'fastq'), 1):
            # str() is the supported way to get the sequence text;
            # Seq.tostring() has been removed from current Biopython.
            bc_counts[str(read.seq)] += 1
            # Equality test on purpose: a non-positive maxreads never matches,
            # so the whole file is read in that case.
            if n_reads == maxreads:
                break

    # Single-argument parenthesised print behaves the same on Python 2 and 3
    print(sorted(bc_counts.items(), key=lambda x: x[1], reverse=True)[:20])

    # Plot results: abundance against rank, both axes logarithmic
    abundances = sorted(bc_counts.values(), reverse=True)
    plt.figure()
    ax = plt.subplot(111)
    plt.plot(range(1, len(abundances) + 1), abundances)
    ax.set_yscale('log')
    ax.set_xscale('log')
    plt.xlabel('barcode rank')
    plt.ylabel('abundance')

    # Interactive mode is switched on first so that show() does not block
    plt.ion()
    plt.show()
Exemple #2
0
    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'w') as f:
            f.write('Call: python demultiplex.py --run ' + seq_run +
                    ' --verbose ' + str(VERBOSE) + '\n')

    adapters_designed = get_adapters_designed(dataset,
                                              VERBOSE=VERBOSE,
                                              summary=summary)

    make_output_folders(data_folder,
                        adapters_designed,
                        VERBOSE=VERBOSE,
                        summary=summary)

    data_filenames = get_raw_read_files(dataset)

    # Is it a dual index library?
    if '-' not in adapters_designed[0][0]:
        demultiplex_reads_single_index(data_folder,
                                       data_filenames,
                                       adapters_designed,
                                       maxreads=maxreads,
                                       VERBOSE=VERBOSE,
                                       summary=summary)
    else:
        demultiplex_reads_dual_index(data_folder,
                                     data_filenames,
                                     adapters_designed,
                                     maxreads=maxreads,
                                     VERBOSE=VERBOSE,
    # If submit, outsource to the cluster
    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary)
        sys.exit()

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'w') as f:
            f.write('Call: python demultiplex.py --run '+seq_run+' --verbose '+str(VERBOSE)+'\n')

    adapters_designed = get_adapters_designed(dataset, VERBOSE=VERBOSE, summary=summary)

    make_output_folders(data_folder, adapters_designed, VERBOSE=VERBOSE,
                        summary=summary)

    data_filenames = get_raw_read_files(dataset)

    # Is it a dual index library?
    if '-' not in adapters_designed[0][0]:
        demultiplex_reads_single_index(data_folder, data_filenames, adapters_designed,
                                       maxreads=maxreads, VERBOSE=VERBOSE,
                                       summary=summary)
    else:
        demultiplex_reads_dual_index(data_folder, data_filenames, adapters_designed,
                                     maxreads=maxreads, VERBOSE=VERBOSE,
                                     summary=summary)