Example #1
0
def type_sequences(input, grouping=GROUPING, exon_fofn=None,
                   genomic_reference=None, cDNA_reference=None, loci=None):
    """
    Run the full typing pipeline on ``input`` and return its summary.

    The stages run strictly in sequence, each consuming the previous
    stage's output: rename -> align to the genomic reference -> orient ->
    extract alleles -> re-align the selected alleles -> extract cDNA ->
    align cDNA by identity -> summarize.

    Any falsy argument is replaced by a default: ``grouping`` by the
    module-level ``GROUPING``; ``exon_fofn``, ``genomic_reference`` and
    ``cDNA_reference`` by the results of ``get_exon_reference()``,
    ``get_genomic_reference()`` and ``get_cDNA_reference()``.
    ``loci`` is passed through unchanged to ``extract_alleles``.

    Returns whatever ``summarize_typing`` returns.
    """
    # Set up logging before any work; the log location depends on the input
    log_path = get_log_file(input)
    initialize_logger(log, log_file=log_path)

    # Fill in whatever the caller left unspecified
    if not grouping:
        grouping = GROUPING
    if not exon_fofn:
        exon_fofn = get_exon_reference()
    if not genomic_reference:
        genomic_reference = get_genomic_reference()
    if not cDNA_reference:
        cDNA_reference = get_cDNA_reference()

    # Resolve the sequence file to work on (``input`` may be a directory)
    source = get_input_file(input)

    # First pass: the same alignment drives both orientation and allele
    # selection
    renamed = rename_sequences(source)
    first_pass = full_align_best_reference(renamed, genomic_reference)
    oriented = orient_sequences(renamed, alignment_file=first_pass)
    alleles = extract_alleles(oriented,
                              alignment_file=first_pass,
                              method=grouping,
                              loci=loci)

    # Second pass: the selected alleles are aligned again on their own
    genomic_hits = full_align_best_reference(alleles, genomic_reference)
    cdna = extract_cDNA(alleles, exon_fofn, alignment_file=genomic_hits)
    cdna_hits = align_by_identity(cdna, cDNA_reference)
    return summarize_typing(genomic_hits, cdna_hits)
Example #2
0
def fetch_chimera_scores(folder, reference):
    """
    Print the logged values of the 'good' and 'bad' consensus sequences.

    For every ``barcode -> reference file`` pair returned by
    ``find_reference_files(reference)``, the matching sample file from
    ``split_results(folder)`` is aligned with
    ``full_align_best_reference``. The hits are grouped by
    ``hits_by_reference`` and split by ``separate_best_hits`` into best
    hits ('good') and the rest ('bad').

    Each hit's ``qname`` is then looked up in ``read_log_data(folder)``.
    Three things are printed:
      * the number of good and bad hits,
      * the five good entries with the highest logged value,
      * every bad entry, ordered by ascending logged value.

    Entries are ``(qname, value)`` tuples. Hits sharing a ``qname``
    collapse into one entry because the values are collected in a dict
    keyed by ``qname``.

    Returns None. A barcode missing from the split results, or a qname
    missing from the log data, propagates the lookup error unchanged
    (a KeyError if those containers are plain dicts).
    """
    # Check and read the reference data
    reference_files = find_reference_files(reference)

    # Check and read the query data
    barcode_files = split_results(folder)

    # Check and read the log data
    log_data = read_log_data(folder)

    good = []
    bad = []
    # The per-barcode reference gets its own name so the ``reference``
    # argument is not overwritten inside the loop.
    for barcode, barcode_reference in reference_files.items():
        sample_file = barcode_files[barcode]
        alignments = full_align_best_reference(sample_file,
                                               barcode_reference)
        by_reference = hits_by_reference(alignments)
        best, rest = separate_best_hits(by_reference)
        good += best
        bad += rest

    # Single-argument print(...) and dict.items() behave the same on
    # Python 2.7 and Python 3.
    print("Found %s 'good' and %s 'bad' consensus sequences" % (len(good),
                                                                len(bad)))
    good_scores = sorted({r.qname: log_data[r.qname]
                          for r in good}.items(),
                         key=itemgetter(1),
                         reverse=True)
    bad_scores = sorted({r.qname: log_data[r.qname]
                         for r in bad}.items(),
                        key=itemgetter(1))
    print(good_scores[:5])
    print(bad_scores)
Example #3
0
def type_sequences(input_folder, exon_fofn, genomic_reference, cDNA_reference):
    """
    Run the typing pipeline on the Amplicon Analysis files in a folder.

    Two paths are built inside ``input_folder`` (they are not checked for
    existence here): ``amplicon_analysis.fastq`` and
    ``amplicon_analysis.csv``. The FASTQ is aligned to
    ``genomic_reference`` and oriented, alleles are extracted, and the
    CSV is oriented and subset alongside. The selected alleles are then
    re-aligned, their cDNA is extracted using ``exon_fofn`` and aligned
    to ``cDNA_reference``, and both alignments are passed to
    ``summarize_typing``.

    Returns None; the result of ``summarize_typing`` is not propagated.
    """
    fastq_path, csv_path = [os.path.join(input_folder, filename)
                            for filename in ('amplicon_analysis.fastq',
                                             'amplicon_analysis.csv')]

    # One initial alignment drives orientation of both the sequences and
    # the CSV, as well as allele selection
    initial_hits = align_best_reference(fastq_path, genomic_reference)
    oriented_seqs = orient_sequences(fastq_path, alignment_file=initial_hits)
    oriented_csv = orient_amp_analysis(csv_path, initial_hits)
    alleles = extract_alleles(oriented_seqs, alignment_file=initial_hits)

    # Return value is not needed; the call is kept for whatever side
    # effects it has (presumably writing the subset CSV - TODO confirm)
    subset_amp_analysis(oriented_csv, alleles)

    # Re-align only the selected alleles, then type at the cDNA level
    genomic_hits = full_align_best_reference(alleles, genomic_reference)
    cdna = extract_cDNA(alleles, exon_fofn, alignment_file=genomic_hits)
    cdna_hits = align_by_identity(cdna, cDNA_reference)
    summarize_typing(genomic_hits, cdna_hits)
Example #4
0
def type_sequences(input_folder, exon_fofn, genomic_reference, cDNA_reference):
    """
    Type the Amplicon Analysis consensus sequences stored in a folder.

    Works on ``<input_folder>/amplicon_analysis.fastq`` and
    ``<input_folder>/amplicon_analysis.csv``. The steps, in order:

      1. align the FASTQ against ``genomic_reference``;
      2. orient the sequences and the CSV using that alignment;
      3. extract alleles and subset the CSV to them;
      4. re-align the extracted alleles against ``genomic_reference``;
      5. extract cDNA via ``exon_fofn`` and align it to
         ``cDNA_reference`` by identity;
      6. hand both alignments to ``summarize_typing``.

    Nothing is returned.
    """
    def in_folder(filename):
        # Path of a file that lives directly inside the input folder
        return os.path.join(input_folder, filename)

    reads = in_folder('amplicon_analysis.fastq')
    table = in_folder('amplicon_analysis.csv')

    # Steps 1-3: everything here keys off the first alignment
    first_alignment = align_best_reference(reads, genomic_reference)
    oriented_reads = orient_sequences(reads, alignment_file=first_alignment)
    oriented_table = orient_amp_analysis(table, first_alignment)
    chosen = extract_alleles(oriented_reads, alignment_file=first_alignment)
    # Result intentionally discarded; nothing below reads it
    subset_amp_analysis(oriented_table, chosen)

    # Steps 4-6: genomic and cDNA alignments of the chosen alleles
    genomic_alignment = full_align_best_reference(chosen, genomic_reference)
    cdna_sequences = extract_cDNA(chosen, exon_fofn,
                                  alignment_file=genomic_alignment)
    cdna_alignment = align_by_identity(cdna_sequences, cDNA_reference)
    summarize_typing(genomic_alignment, cdna_alignment)
Example #5
0
def type_fasta(input_fofn, input_fasta, exon_fofn, genomic_reference,
               cDNA_reference):
    """
    Type the sequences in ``input_fasta``, then summarize their alleles.

    The work happens in three stages:

      1. Typing: align ``input_fasta`` to ``genomic_reference``, orient
         it, extract alleles, re-align them, extract and align their
         cDNA, and call ``summarize_typing``.
      2. Chimeras: build a chimera file from the selected alleles and
         combine it with ``input_fasta`` into ``<stem>.combined.fasta``.
         ``<stem>`` is the chimera file name minus its last two
         dot-separated parts.
      3. Allele breakdown: write ``reads_of_insert.fasta`` next to
         ``input_fasta`` from ``input_fofn``, align it against the
         combined file, and call ``summarize_alleles``.

    Returns None.
    """
    # Stage 1: typing
    initial_hits = align_best_reference(input_fasta, genomic_reference)
    oriented = orient_fasta(input_fasta, alignment_file=initial_hits)
    alleles = extract_alleles(oriented, alignment_file=initial_hits)
    genomic_hits = full_align_best_reference(alleles, genomic_reference)
    cdna = extract_cDNA(alleles, exon_fofn, alignment_file=genomic_hits)
    cdna_hits = align_by_identity(cdna, cDNA_reference)
    summarize_typing(genomic_hits, cdna_hits)

    # Stage 2: mock chimeras, pooled with the original sequences
    chimeras = create_chimeras(alleles, alignment_file=genomic_hits)
    name_parts = chimeras.split('.')
    combined = '%s.combined.fasta' % '.'.join(name_parts[:-2])
    combine_fasta([input_fasta, chimeras], combined)

    # Stage 3: competitive alignment of the best reads against that pool
    best_reads = os.path.join(os.path.dirname(input_fasta),
                              'reads_of_insert.fasta')
    extract_best_reads(input_fofn, best_reads)
    best_hits = align_best_reference(best_reads, combined)
    summarize_alleles(best_hits, initial_hits, alleles)
Example #6
0
def fetch_chimera_scores(folder, reference):
    # Check and read the reference data
    reference_files = find_reference_files(reference)

    # Check and read the query data
    barcode_files = split_results(folder)

    # Check and read the log data
    log_data = read_log_data(folder)

    good = []
    bad = []
    for barcode, reference in reference_files.iteritems():
        sample_file = barcode_files[barcode]
        alignments = full_align_best_reference(sample_file, reference)
        by_reference = hits_by_reference(alignments)
        best, rest = separate_best_hits(by_reference)
        good += best
        bad += rest

    print "Found %s 'good' and %s 'bad' consensus sequences" % (len(good), len(bad))
    good_scores = sorted({r.qname: log_data[r.qname] for r in good}.iteritems(), key=itemgetter(1), reverse=True)
    bad_scores = sorted({r.qname: log_data[r.qname] for r in bad}.iteritems(), key=itemgetter(1))
    print [k for k in good_scores[:5]]