Ejemplo n.º 1
0
def order_references( subread_file, reference_file ):
    """
    Rank reference sequences by how many alignment hits name them.

    The subreads are aligned against the references with
    align_best_reference, unless a valid 'temp.m1' alignment file is
    already present, in which case that file is re-used. The target name
    of every hit in the alignment file is then tallied.

    Returns a list of every target name seen, most frequently hit first.
    """
    log.info("Selecting the best references sequences to use")
    alignment_path = 'temp.m1'
    # Only run the aligner when no usable alignment is on disk yet
    if not valid_file( alignment_path ):
        align_best_reference( subread_file, reference_file, alignment_path )
    hit_counts = Counter()
    for record in BlasrReader(alignment_path):
        hit_counts[record.tname] += 1
    return [name for name, _ in hit_counts.most_common()]
Ejemplo n.º 2
0
def sort_subreads( subread_file, reference_file ):
    """
    Map each aligned subread to the reference it hit.

    A valid 'temp2.m1' alignment file is re-used when present; otherwise
    the subreads are first aligned to the references with
    align_best_reference, writing to that file.

    Returns a dict of hit query name -> hit target name. When a query
    name occurs in several hits, the last hit read wins.
    """
    log.info("Aligning subreads to the two best references")
    alignment_path = 'temp2.m1'
    # Create the alignment only when there is no usable one on disk
    if not valid_file( alignment_path ):
        align_best_reference( subread_file, reference_file, alignment_path )
    return {record.qname: record.tname for record in BlasrReader(alignment_path)}
Ejemplo n.º 3
0
def sort_subreads(subread_file, reference_file):
    """
    Assign every aligned subread to the reference named in its hit.

    Uses the alignment stored in 'temp2.m1', creating it with
    align_best_reference first when no valid file exists yet.

    Returns a dict keyed by hit query name whose values are hit target
    names; a later hit for the same query replaces an earlier one.
    """
    log.info("Aligning subreads to the two best references")
    m1_file = 'temp2.m1'
    if not valid_file(m1_file):
        align_best_reference(subread_file, reference_file, m1_file)
    assignments = {}
    for hit in BlasrReader(m1_file):
        assignments[hit.qname] = hit.tname
    return assignments
Ejemplo n.º 4
0
def order_references(subread_file, reference_file):
    """
    Order the references by the number of alignment hits they received.

    Reads the alignment in 'temp.m1', running align_best_reference to
    create it first when no valid file is present, and counts how often
    each target name appears among the hits.

    Returns the list of target names, from most to least frequently hit.
    """
    log.info("Selecting the best references sequences to use")
    m1_file = 'temp.m1'
    if not valid_file(m1_file):
        align_best_reference(subread_file, reference_file, m1_file)
    tally = Counter(hit.tname for hit in BlasrReader(m1_file))
    ranked = tally.most_common()
    return [target for target, _count in ranked]
Ejemplo n.º 5
0
 def align_contigs_to_genome(self, contig_file):
     """
     Align the contigs to self.human_reference, re-using prior results.

     The alignment lives at the 'alignments'/'contigs_to_genome.m1' path
     obtained from self.get_filepath. If that file is not valid, the
     alignment is run and its output checked with check_output_file.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Contig-to-Genome alignment data")
     alignment_path = self.get_filepath( 'alignments', 'contigs_to_genome.m1' )
     if not valid_file( alignment_path ):
         log.info("No Contig->Genome alignment found, creating...")
         align_best_reference( contig_file, self.human_reference, output=alignment_path )
         check_output_file( alignment_path )
         log.info("Finished aligning contigs to the genomic reference\n")
     else:
         log.info("Using existing Contig->Genome alignment file\n")
     return create_m1_reference( alignment_path )
Ejemplo n.º 6
0
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another

    The alignment is written to a temporary '.m1' file that is always
    removed before returning, including when alignment or parsing fails.

    Returns the list of hits read by BlasrReader, or None when the
    alignment output is not a valid file.
    """
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    # Only the reserved name is needed; close our handle so it is not
    # leaked and the aligner can write to the path freely.
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if not valid_file(temp.name):
            return None
        # Materialise the hits before the file is deleted below
        return list(BlasrReader(temp.name))
    finally:
        os.unlink(temp.name)
Ejemplo n.º 7
0
 def align_contigs_to_reference(self, contig_file, reference_file):
     """
     Align the contigs to the given reference Fasta, re-using prior results.

     The alignment lives at the 'alignments'/'contigs_to_reference.m1'
     path obtained from self.get_filepath. If that file is not valid, the
     alignment is run and its output checked with check_output_file.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Contig-to-Reference alignment data")
     alignment_path = self.get_filepath( 'alignments', 'contigs_to_reference.m1' )
     if not valid_file( alignment_path ):
         log.info("No Contig->Reference alignment found, creating...")
         align_best_reference( contig_file, reference_file, output=alignment_path )
         check_output_file( alignment_path )
         log.info("Finished aligning contigs to the HLA reference data\n")
     else:
         log.info("Using an existing Contig->Reference alignment file\n")
     return create_m1_reference( alignment_path )
Ejemplo n.º 8
0
 def align_subreads_to_contigs(self, subread_file, contig_file ):
     """
     Align the subreads to the contigs, re-using prior results.

     The alignment lives at the 'alignments'/'subreads_to_contigs.m1'
     path obtained from self.get_filepath. If that file is not valid, the
     alignment is run and its output checked with check_output_file.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Subread-to-Contig alignment data")
     alignment_path = self.get_filepath( 'alignments', 'subreads_to_contigs.m1' )
     if not valid_file( alignment_path ):
         log.info("No Subread->Contig alignment found, creating...")
         align_best_reference( subread_file, contig_file, output=alignment_path )
         check_output_file( alignment_path )
         log.info("Finished aligning subreads to the HBAR contigs\n")
     else:
         log.info("Using existing Subread->Contig alignment file\n")
     return create_m1_reference( alignment_path )
Ejemplo n.º 9
0
 def align_contigs_to_genome(self, contig_file):
     """
     Produce the Contig-to-Genome alignment reference.

     An existing valid 'contigs_to_genome.m1' file under 'alignments'
     (path from self.get_filepath) is used as-is. Otherwise the contigs
     are aligned to self.human_reference and the output is checked.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Contig-to-Genome alignment data")
     m1_path = self.get_filepath('alignments', 'contigs_to_genome.m1')
     # Fast path: a previous run already produced the alignment
     if valid_file(m1_path):
         log.info("Using existing Contig->Genome alignment file\n")
         return create_m1_reference(m1_path)
     log.info("No Contig->Genome alignment found, creating...")
     align_best_reference(contig_file,
                          self.human_reference,
                          output=m1_path)
     check_output_file(m1_path)
     log.info("Finished aligning contigs to the genomic reference\n")
     return create_m1_reference(m1_path)
Ejemplo n.º 10
0
def create_chimeras(input_file,
                    output=None,
                    reference_file=None,
                    alignment_file=None):
    """
    Create chimera sequences from a Fasta file and write them to a Fasta

    input_file     -- Fasta file holding the source sequences
    output         -- path for the chimera Fasta; when None, the input
                      path with its extension replaced by '.chimeras.fasta'
    reference_file -- reference to align input_file against; only used
                      when no alignment_file is given
    alignment_file -- existing alignment of input_file, read by BlasrReader

    Returns the path of the output file.
    Raises IOError when neither an alignment nor a reference is supplied.
    """
    # Local import: the module-level imports are not visible from here
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)
    # Set the output file if not specified; splitext only strips a real
    # extension, so extension-less names and dotted directories are safe
    if output is None:
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data, group and filter the hits, then build the
    # chimeras from the grouped hits and the input sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
Ejemplo n.º 11
0
 def align_contigs_to_reference(self, contig_file, reference_file):
     """
     Produce the Contig-to-Reference alignment reference.

     An existing valid 'contigs_to_reference.m1' file under 'alignments'
     (path from self.get_filepath) is used as-is. Otherwise the contigs
     are aligned to reference_file and the output is checked.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Contig-to-Reference alignment data")
     m1_path = self.get_filepath('alignments', 'contigs_to_reference.m1')
     # Fast path: a previous run already produced the alignment
     if valid_file(m1_path):
         log.info("Using an existing Contig->Reference alignment file\n")
         return create_m1_reference(m1_path)
     log.info("No Contig->Reference alignment found, creating...")
     align_best_reference(contig_file,
                          reference_file,
                          output=m1_path)
     check_output_file(m1_path)
     log.info("Finished aligning contigs to the HLA reference data\n")
     return create_m1_reference(m1_path)
Ejemplo n.º 12
0
 def align_subreads_to_contigs(self, subread_file, contig_file):
     """
     Produce the Subread-to-Contig alignment reference.

     An existing valid 'subreads_to_contigs.m1' file under 'alignments'
     (path from self.get_filepath) is used as-is. Otherwise the subreads
     are aligned to contig_file and the output is checked.

     Returns the result of create_m1_reference on the alignment file.
     """
     log.info("Looking for Subread-to-Contig alignment data")
     m1_path = self.get_filepath('alignments', 'subreads_to_contigs.m1')
     # Fast path: a previous run already produced the alignment
     if valid_file(m1_path):
         log.info("Using existing Subread->Contig alignment file\n")
         return create_m1_reference(m1_path)
     log.info("No Subread->Contig alignment found, creating...")
     align_best_reference(subread_file,
                          contig_file,
                          output=m1_path)
     check_output_file(m1_path)
     log.info("Finished aligning subreads to the HBAR contigs\n")
     return create_m1_reference(m1_path)
Ejemplo n.º 13
0
def type_fasta( input_fofn, input_fasta, exon_fofn, genomic_reference, cDNA_reference ):
    """
    Run the typing pipeline on a Fasta file of sequences.

    Three stages are executed in order:
      1. Align the sequences to the genomic reference, re-orient them,
         extract the selected alleles, re-align those, extract cDNA and
         summarize the typing.
      2. Create chimera sequences from the selected alleles and combine
         them with the input Fasta into a '<stem>.combined.fasta' file.
      3. Extract the best reads into 'reads_of_insert.fasta' next to the
         input Fasta, align them to the combined file and summarize the
         alleles.

    Nothing is returned; results are whatever the helper functions emit.
    """
    # Stage 1: align the sequences to the reference and annotate typing
    initial_alignment = align_best_reference( input_fasta, genomic_reference )
    oriented_fasta = orient_fasta( input_fasta, alignment_file=initial_alignment )
    allele_fasta = extract_alleles( oriented_fasta, alignment_file=initial_alignment )
    genomic_alignment = full_align_best_reference( allele_fasta, genomic_reference )
    cdna_fasta = extract_cDNA( allele_fasta, exon_fofn, alignment_file=genomic_alignment )
    cdna_alignment = align_by_identity( cdna_fasta, cDNA_reference )
    summarize_typing( genomic_alignment, cdna_alignment )
    # Stage 2: generate mock chimera sequences and pool them with the input
    chimera_fasta = create_chimeras( allele_fasta, alignment_file=genomic_alignment )
    # Drop the last two dot-separated suffixes of the chimera file name
    stem = '.'.join( chimera_fasta.split('.')[:-2] )
    combined_fasta = '%s.combined.fasta' % stem
    combine_fasta( [input_fasta, chimera_fasta], combined_fasta )
    # Stage 3: competitive alignment of the best reads against the pooled
    # sequences, used to summarize the allelic breakdown
    reads_fasta = os.path.join( os.path.dirname( input_fasta ), 'reads_of_insert.fasta' )
    extract_best_reads( input_fofn, reads_fasta )
    reads_alignment = align_best_reference( reads_fasta, combined_fasta )
    summarize_alleles( reads_alignment, initial_alignment, allele_fasta )
Ejemplo n.º 14
0
def type_sequences( input_folder, exon_fofn, genomic_reference, cDNA_reference ):
    """
    Run the typing pipeline on the Amplicon Analysis output in a folder.

    Reads 'amplicon_analysis.fastq' and 'amplicon_analysis.csv' from
    input_folder, aligns the sequences to the genomic reference,
    re-orients both the sequences and the CSV, extracts the selected
    alleles, re-aligns them, extracts cDNA and summarizes the typing.

    Nothing is returned; results are whatever the helper functions emit.
    """
    fastq_path = os.path.join( input_folder, 'amplicon_analysis.fastq' )
    csv_path = os.path.join( input_folder, 'amplicon_analysis.csv' )
    # Align the sequences to the reference, then orient sequences and CSV
    initial_alignment = align_best_reference( fastq_path, genomic_reference )
    oriented_seqs = orient_sequences( fastq_path, alignment_file=initial_alignment )
    oriented_csv = orient_amp_analysis( csv_path, initial_alignment )
    # Keep only the selected alleles, in both the sequences and the CSV
    allele_seqs = extract_alleles( oriented_seqs, alignment_file=initial_alignment )
    # The result is not used below; the call is kept for whatever output
    # subset_amp_analysis produces
    allele_csv = subset_amp_analysis( oriented_csv, allele_seqs )
    # Re-align the selected alleles and type them at the cDNA level
    genomic_alignment = full_align_best_reference( allele_seqs, genomic_reference )
    cdna_seqs = extract_cDNA( allele_seqs, exon_fofn, alignment_file=genomic_alignment )
    cdna_alignment = align_by_identity( cdna_seqs, cDNA_reference )
    summarize_typing( genomic_alignment, cdna_alignment )
Ejemplo n.º 15
0
def type_sequences(input_folder, exon_fofn, genomic_reference, cDNA_reference):
    """
    Type the Amplicon Analysis sequences found in input_folder.

    The folder must hold 'amplicon_analysis.fastq' and
    'amplicon_analysis.csv'. The sequences are aligned to the genomic
    reference and re-oriented (as is the CSV), the selected alleles are
    extracted and re-aligned, cDNA is extracted from them, and the
    genomic and cDNA alignments are passed to summarize_typing.

    Nothing is returned; results are whatever the helper functions emit.
    """
    fastq = os.path.join(input_folder, 'amplicon_analysis.fastq')
    table = os.path.join(input_folder, 'amplicon_analysis.csv')
    # Initial alignment drives the orientation of sequences and CSV alike
    first_pass = align_best_reference(fastq, genomic_reference)
    oriented = orient_sequences(fastq, alignment_file=first_pass)
    oriented_table = orient_amp_analysis(table, first_pass)
    alleles = extract_alleles(oriented, alignment_file=first_pass)
    # Return value unused below; the call is kept for whatever output
    # subset_amp_analysis produces
    allele_table = subset_amp_analysis(oriented_table, alleles)
    # Second, full alignment of just the selected alleles
    genomic_hits = full_align_best_reference(alleles, genomic_reference)
    cdna = extract_cDNA(alleles,
                        exon_fofn,
                        alignment_file=genomic_hits)
    cdna_hits = align_by_identity(cdna, cDNA_reference)
    summarize_typing(genomic_hits, cdna_hits)
Ejemplo n.º 16
0
def create_chimeras( input_file, output=None, reference_file=None, alignment_file=None ):
    """
    Create chimera sequences from a Fasta file and write them to a Fasta

    input_file     -- Fasta file holding the source sequences
    output         -- path for the chimera Fasta; when None, the input
                      path with its extension replaced by '.chimeras.fasta'
    reference_file -- reference to align input_file against; only used
                      when no alignment_file is given
    alignment_file -- existing alignment of input_file, read by BlasrReader

    Returns the path of the output file.
    Raises IOError when neither an alignment nor a reference is supplied.
    """
    # Local import: the module-level imports are not visible from here
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error( msg )
            raise IOError( msg )
        alignment_file = align_best_reference( input_file, reference_file )
    # Set the output file if not specified; splitext only strips a real
    # extension, so extension-less names and dotted directories are safe
    if output is None:
        basename = os.path.splitext( input_file )[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data, group and filter the hits, then build the
    # chimeras from the grouped hits and the input sequences
    alignments = list( BlasrReader( alignment_file ))
    groups = _group_by_locus( alignments )
    groups = _filter_groups( groups )
    sequences = list( FastaReader( input_file ))
    chimeras = list( _create_chimeras( groups, sequences ))
    write_fasta( chimeras, output )
    return output