예제 #1
0
def main():
    '''
    Command-line entry point.  Reads the PASA FASTA and GTF, the transdecoder ORF file and
    the Salmon quant file via the loader functions, prints the initial cluster and
    transcript counts, then runs apply_orf_filter followed by apply_abundance_filter,
    calling write_unigenes after each one.
    '''
    parser = argparse.ArgumentParser( description='Export the transcript from each PASA cluster with the longest ORF')
    parser.add_argument('-if', '--input_pasa_fasta', type=str, required=True, help='Path to PASAs predicted FASTA file' )
    # Parsed as int so a user-supplied value has the same type as the default (300).
    # NOTE(review): this option is parsed but not used anywhere in this function.
    parser.add_argument('-itsc', '--input_pasa_transcript_size_cutoff', type=int, required=False, default=300, help='Nucleotide size cutoff of input transcripts' )
    parser.add_argument('-itc', '--input_pasa_tpm_cutoff', type=float, required=False, default=0.1, help='TPM cutoff of input transcripts' )
    parser.add_argument('-ig', '--input_pasa_gtf', type=str, required=True, help='Path to PASAs predicted GFT file' )
    parser.add_argument('-it', '--input_transdecoder', type=str, required=True, help='Path to the longest_orfs.cds from transdecoder' )
    parser.add_argument('-is', '--input_salmon', type=str, required=True, help='Path to quant.sf file from Salmon' )
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.input_pasa_fasta)
    pasa_clusters = load_pasa_clusters(seqs, args.input_pasa_gtf)
    orf_lengths = longest_orf_lengths(args.input_transdecoder)
    load_abundances(seqs, args.input_salmon)

    print("Stats:\n")
    print("Initial PASA cluster count: {0}".format(len(pasa_clusters)))
    print("Initial PASA transcript count: {0}".format(len(seqs)))

    # 100 is the cutoff value handed to apply_orf_filter
    seqs, pasa_clusters = apply_orf_filter(seqs, pasa_clusters, orf_lengths, 100)
    write_unigenes(seqs, pasa_clusters, 'pasa.orf_filtered', orf_lengths)

    seqs, pasa_clusters = apply_abundance_filter(seqs, pasa_clusters, args.input_pasa_tpm_cutoff)
    # NOTE(review): the output label says "0.03" regardless of the TPM cutoff actually
    # applied; kept unchanged in case downstream steps expect this exact name.
    write_unigenes(seqs, pasa_clusters, 'pasa.orf_and_tpm_0.03_filtered', orf_lengths)
def process_fasta(mols, fasta_file):
    '''
    Replaces the value of every entry in mols whose key is also an ID in fasta_file
    with that record's residue string.  Entries without a matching FASTA record are
    left as they were.  mols is modified in place; nothing is returned.
    '''
    parsed = utils.fasta_dict_from_file(fasta_file)

    for key in mols:
        # nothing to fill in when the FASTA file has no record for this molecule
        if key not in parsed:
            continue

        mols[key] = parsed[key]['s']
def main():
    '''
    Command-line entry point.  Loads the genomic FASTA and the protein coordinates,
    makes sure the output directory exists, writes the per-molecule subdirectories,
    elements file and sequence files, then parses every non-comment line of the
    annotation table and writes the annotation files.
    '''
    parser = argparse.ArgumentParser( description='Transforms a tab-delimited annotation file to PathoLogic format')

    ## output file to be written
    parser.add_argument('-a', '--annotation_tab', type=str, required=True, help='Path to an input file to be parsed' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True, help='Underlying nucleotide FASTA file for the annotated proteins' )
    parser.add_argument('-p', '--protein_fasta', type=str, required=True, help='Protein input sequences to the pipeline, with specific headers required' )
    parser.add_argument('-o', '--output_dir', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    molecules = utils.fasta_dict_from_file(args.genomic_fasta)
    protein_coords = get_protein_coordinates_from_FASTA(args.protein_fasta)

    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(args.output_dir, exist_ok=True)

    create_subdirectories(molecules, args.output_dir)
    write_elements_file(molecules, args.output_dir)
    write_seq_files(molecules, args.output_dir)

    genes = dict()

    # the context manager guarantees the annotation file is closed, even on a parse error
    with open(args.annotation_tab) as annotation_fh:
        for line in annotation_fh:
            # lines starting with '#' are comments
            if line.startswith("#"):
                continue

            parse_annotation_line( line, genes, molecules )

    write_annotation_files( genes, molecules, protein_coords, args.output_dir )
예제 #4
0
def main():
    '''
    Command-line entry point.  Counts every residue character across all sequences in
    the input FASTA file and writes a '# Total residues found' line followed by one
    tab-delimited line per residue: the residue, its count, and its percentage of the
    total.  Output goes to --output_file when given, otherwise to STDOUT.
    '''
    parser = argparse.ArgumentParser( description='Reports base/residue composition of a FASTA file')

    ## output file to be written
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        total_residues = 0
        total_residue_counts = Counter()

        for seq_id in seqs:
            residues = seqs[seq_id]['s']
            total_residues += len(residues)
            # Counter.update adds the per-character counts of this sequence to the totals
            total_residue_counts.update(residues)

        fout.write("# Total residues found: {0}\n".format(total_residues))

        # when no residues were seen the counter is empty, so the division below never runs
        for residue in total_residue_counts:
            residue_count = total_residue_counts[residue]
            residue_perc  = (residue_count / total_residues) * 100
            fout.write("{0}\t{1}\t{2}\n".format(residue, residue_count, residue_perc))
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
예제 #5
0
def main():
    '''
    Command-line entry point.  Reads a multi-FASTA file and writes each distinct
    sequence once: the first record with a given residue string is written (ID, header
    and residues wrapped at 60 per line) and any later record whose residues have the
    same MD5 digest is skipped.  Output goes to --output_file when given, otherwise
    to STDOUT.
    '''
    parser = argparse.ArgumentParser( description='Read a multi-FASTA file sequence and remove duplicates (by MD5 hash)')

    ## output file to be written
    parser.add_argument('-i', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)
        seen_digests = set()

        for seq_id in seqs:
            seq = seqs[seq_id]['s']

            # A fresh hash object is needed for every sequence: update() on a shared
            # object accumulates data, so the digest would cover all sequences seen so
            # far and identical sequences would never compare equal.
            md5sum = hashlib.md5(seq.encode()).hexdigest()

            if md5sum in seen_digests:
                continue

            seen_digests.add(md5sum)

            ## write this sequence, 60bp per line
            fout.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            for i in range(0, len(seq), 60):
                fout.write(seq[i : i + 60] + "\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point.  For every sequence in the input FASTA file, finds runs
    of a single repeated residue (other than 'N') that are longer than
    --homopolymer_length_limit and replaces each such run with the same number of 'N'
    characters.  A warning is written to STDERR for each replacement, reporting the
    1-based start position.  The masked sequences are written to --output when given,
    otherwise to STDOUT.
    '''
    parser = argparse.ArgumentParser( description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-hll', '--homopolymer_length_limit', type=int, required=True, help='Stretches of non-N residues longer than this will be replaced with Ns' )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open( args.output, 'wt' )

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = utils.fasta_dict_from_file(args.input)

        length_limit = args.homopolymer_length_limit
        sys.stderr.write("INFO: Looking for homopolymeric runs > {0} bp\n".format(length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            original_seq = seqs[seq_id]['s']
            # Accumulates the replacements.  Each run is replaced by the same number of
            # Ns, so indexes into the original remain valid for the masked copy.
            current_seq = original_seq
            current_homopolymer_base = None
            current_homopolymer_length = 0
            current_homopolymer_start_idx = 0

            for base_index, base in enumerate(original_seq):
                if base == current_homopolymer_base:
                    current_homopolymer_length += 1
                    continue

                # the previous run just ended; mask it if it was too long
                if current_homopolymer_length > length_limit and current_homopolymer_base != 'N':
                    sys.stderr.write("WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n".format(
                        seq_id, current_homopolymer_start_idx + 1, current_homopolymer_base, current_homopolymer_length))
                    sys.stderr.flush()

                    # splice into current_seq (not the original) so earlier replacements
                    # in the same sequence are preserved
                    current_seq = "{0}{1}{2}".format(current_seq[0:current_homopolymer_start_idx],
                                                     'N' * current_homopolymer_length,
                                                     current_seq[base_index:])

                current_homopolymer_base = base
                current_homopolymer_length = 1
                current_homopolymer_start_idx = base_index

            ## check after the last base for any run which terminates the sequence
            if current_homopolymer_length > length_limit and current_homopolymer_base != 'N':
                sys.stderr.write("WARNING: Replacing {3} bp of {2} bases in Sequence ID {0} starting at position {1}\n".format(
                    seq_id, current_homopolymer_start_idx + 1, current_homopolymer_base, current_homopolymer_length))
                sys.stderr.flush()

                # the run extends to the end of the sequence, so nothing follows the Ns
                current_seq = "{0}{1}".format(current_seq[0:current_homopolymer_start_idx],
                                              'N' * current_homopolymer_length)

            seqs[seq_id]['s'] = current_seq
            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(utils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if out_fh is not sys.stdout:
            out_fh.close()
def process_fasta(mols, fasta_file):
    '''
    Fills in sequence data for known molecules.  Each key of mols that also appears as
    an ID in fasta_file has its value replaced by that record's residue string; other
    entries are not touched.  Works in place and returns nothing.
    '''
    records = utils.fasta_dict_from_file(fasta_file)

    for molecule_id in mols:
        # skip molecules the FASTA file provides no sequence for
        if molecule_id not in records:
            continue

        mols[molecule_id] = records[molecule_id]['s']
예제 #8
0
def main():
    '''
    Command-line entry point.  Merges the masking of several versions of the same
    FASTA file.  The last file on the command line is used as the seed; for every
    other file, each position holding 'N' is also set to 'N' in the seed sequence.
    Positions where the two files disagree and neither base is 'N' produce a warning
    on STDERR.  The merged sequences are written to --output_file when given,
    otherwise to STDOUT.

    Raises an Exception if a later file contains a sequence ID missing from the seed
    file, or a sequence whose length differs from the seed's.
    '''
    parser = argparse.ArgumentParser( description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # the last file is the seed; all the others are merged into it
        seed_file = args.fasta_files[-1]
        other_files = args.fasta_files[:-1]

        seqs = utils.fasta_dict_from_file(seed_file)

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in other_files:
            new_seqs = utils.fasta_dict_from_file(fasta_file)

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file) )

                seed_residues = seqs[seq_id]['s']
                new_residues = new_seqs[seq_id]['s']

                # They should also be the same length.  The residues themselves are
                # compared, not the record dicts that hold them.
                if len(seed_residues) != len(new_residues):
                    raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

                for i, base in enumerate(new_residues):
                    seed_base = seed_residues[i]

                    if base == seed_base:
                        continue

                    if base == 'N':
                        seed_residues[i] = 'N'
                    elif seed_base != 'N':
                        # warnings go to STDERR so they can't end up inside FASTA output on STDOUT
                        sys.stderr.write("WARNING: Disagreement {0}-{1} at position {2}\n".format(base, seed_base, i))

        # now done, print out the results
        for seq_id in seqs:
            ofh.write( ">{0} {1}\n{2}\n".format(seq_id, seqs[seq_id]['h'], utils.wrapped_fasta(''.join(seqs[seq_id]['s']))))
    finally:
        # only close handles this function opened; never close STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    '''
    Command-line entry point.  Writes every sequence of the input FASTA file, 60
    residues per line, after reversing or reverse-complementing (per --action) those
    whose ID is listed in the ID file.  Only the sequence ID is written on each header
    line.  Output goes to --output_file when given, otherwise to STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Reverse or reverse-complement selected sequences within a multi-FASTA'
    )

    ## output file to be written
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i',
                        '--id_file',
                        type=str,
                        required=True,
                        help='Path to file with IDs to process')
    parser.add_argument(
        '-a',
        '--action',
        type=str,
        required=True,
        choices=['reverse', 'revcomp'],
        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # a set gives constant-time membership tests in the loop below
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = utils.reverse_complement(seq['s'])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
예제 #10
0
def process_assembly_fasta(mols, fasta_file):
    '''
    For each molecule in mols whose ID is present in fasta_file, stores that record's
    residue string on the molecule's residues attribute and sets its length attribute
    to the number of residues.  Molecules with no matching record are left untouched.
    '''
    records = utils.fasta_dict_from_file(fasta_file)

    for mol_id in mols:
        # the FASTA file may not provide sequence for every molecule
        if mol_id not in records:
            continue

        molecule = mols[mol_id]
        molecule.residues = records[mol_id]['s']
        molecule.length = len(molecule.residues)
예제 #11
0
def process_assembly_fasta(mols, fasta_file):
    '''
    Attaches sequence data to molecule objects.  Every molecule in mols with a record
    of the same ID in fasta_file gets that record's residues assigned to .residues and
    the residue count assigned to .length; the rest are skipped.
    '''
    fasta_records = utils.fasta_dict_from_file(fasta_file)

    for molecule_id in mols:
        # skip molecules which have no sequence in the FASTA file
        if molecule_id not in fasta_records:
            continue

        target = mols[molecule_id]
        target.residues = fasta_records[molecule_id]['s']
        target.length = len(target.residues)
예제 #12
0
def main():
    """
    Command-line entry point.  Writes every sequence of the input FASTA file, 60
    residues per line, after reversing or reverse-complementing (per --action) those
    whose ID is listed in the ID file.  Only the sequence ID is written on each header
    line.  Output goes to --output_file when given, otherwise to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    ## output file to be written
    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True, help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # a set gives constant-time membership tests in the loop below
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = utils.reverse_complement(seq["s"])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
예제 #13
0
def main():
    '''
    Command-line entry point.  For each line of the tab-delimited coordinates file,
    extracts the given region from the named molecule of the input FASTA file and
    writes it as a FASTA record, 60 residues per line.  Regions for which
    utils.humancoords_to_0interbase reports strand -1 are reverse-complemented.  The
    record name comes from --name_col when given; otherwise it is built from the
    molecule ID, coordinates and strand.  Column options are 1-based.

    Lines with fewer than three columns are skipped.  Raises an Exception if a line
    names a molecule which is not in the FASTA file.
    '''
    parser = argparse.ArgumentParser( description='Extract regions from a multi-FASTA file')

    ## output file to be written
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-c', '--coords_file', type=str, required=True, help='Path to a tab-delimited file with coordinates' )
    parser.add_argument('-m', '--mol_col', type=int, required=True, help='Tabdel file column with molecule identifiers' )
    parser.add_argument('-x', '--start_coord_col', type=int, required=True, help='Tabdel file column with coordinate start positions' )
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True, help='Tabdel file column with coordinate stop positions' )
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None, help='Optional tabdel file column with name for exported fragment' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # the user passes 1-based column numbers; convert them to list indexes once
        start_col = args.start_coord_col - 1
        stop_col  = args.stop_coord_col - 1
        mol_col   = args.mol_col - 1
        name_col  = None if args.name_col is None else args.name_col - 1

        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = utils.humancoords_to_0interbase(int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format( mol_id, fmin, fmax, strand  )
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i : i + 60] + "\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
예제 #14
0
def main():
    '''
    Command-line entry point.  Loads the genomic FASTA and the protein coordinates,
    makes sure the output directory exists, writes the per-molecule subdirectories,
    elements file and sequence files, then parses every non-comment line of the
    annotation table and writes the annotation files.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Transforms a tab-delimited annotation file to PathoLogic format')

    ## output file to be written
    parser.add_argument('-a',
                        '--annotation_tab',
                        type=str,
                        required=True,
                        help='Path to an input file to be parsed')
    parser.add_argument(
        '-g',
        '--genomic_fasta',
        type=str,
        required=True,
        help='Underlying nucleotide FASTA file for the annotated proteins')
    parser.add_argument(
        '-p',
        '--protein_fasta',
        type=str,
        required=True,
        help=
        'Protein input sequences to the pipeline, with specific headers required'
    )
    parser.add_argument('-o',
                        '--output_dir',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    molecules = utils.fasta_dict_from_file(args.genomic_fasta)
    protein_coords = get_protein_coordinates_from_FASTA(args.protein_fasta)

    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(args.output_dir, exist_ok=True)

    create_subdirectories(molecules, args.output_dir)
    write_elements_file(molecules, args.output_dir)
    write_seq_files(molecules, args.output_dir)

    genes = dict()

    # the context manager guarantees the annotation file is closed, even on a parse error
    with open(args.annotation_tab) as annotation_fh:
        for line in annotation_fh:
            # lines starting with '#' are comments
            if line.startswith("#"):
                continue

            parse_annotation_line(line, genes, molecules)

    write_annotation_files(genes, molecules, protein_coords, args.output_dir)
예제 #15
0
def get_protein_coordinates_from_FASTA(protein_fasta):
    '''
    This function is probably only relevant to a limited number of tasks where the protein input
    FASTA file to the pipeline was produced by transdecoder, which incorporates the predicted
    ORF coordinates into the FASTA header, like this:

    >m.13585 g.13585  ORF g.13585 m.13585 type:3prime_partial len:76 (+) comp100033_c0_seq1:118-348(+)

    Of all that, the only part we care about is that the first model number 'm.13585' matches that
    of the second column in the annotation file, along with the matching genomic molecule name at
    the end of the header 'comp100033_c0_seq1:118-348(+)'

    Returns a dict keyed on the model name (like 'm.13585') with the followed keyed values:
       'mol'    = molecule ID (like 'comp100033_c0_seq1')
       'fmin'   = 0-interbase start coordinate (117, from example above)
       'fmax'   = 0-interbase stop coordinate (348, from example above)
       'strand' = always 1, since only locations ending in '(+)' are matched

    Raises an Exception for any header which does not contain a forward-strand location
    in the expected format.
    '''
    protein_locs = dict()

    fasta_dict = utils.fasta_dict_from_file(protein_fasta)

    # raw string, so the regex escapes are not treated as (invalid) string escapes
    pattern = re.compile(r'(comp\d+_c\d+_seq\d+)\:(\d+)\-(\d+)\(\+\)')

    for model_id in fasta_dict:
        # NOTE(review): if fasta_dict is a plain dict its keys are already unique and
        # this check can never fire; kept as a safeguard
        if model_id in protein_locs:
            raise Exception(
                "ERROR: found duplicate model ID in file: {0}".format(
                    protein_fasta))

        header = fasta_dict[model_id]['h']
        m = pattern.search(header)

        if not m:
            raise Exception(
                "ERROR: unexpected header format.  Expected to parse something like comp100033_c0_seq1:118-348(+).  Got: {0}"
                .format(header))

        protein_locs[model_id] = {
            'mol': m.group(1),
            # header coordinates are 1-based; convert the start to 0-interbase
            'fmin': int(m.group(2)) - 1,
            'fmax': int(m.group(3)),
            'strand': 1
        }

    return protein_locs
def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Builds one things.Polypeptide per sequence in fasta_file and returns them in a dict
    keyed by sequence ID.  Each polypeptide carries its residues and length, plus an
    annotation.FunctionalAnnotation whose product name is DEFAULT_PRODUCT_NAME.  One
    INFO line per sequence is written to log_fh.
    '''
    fasta_records = utils.fasta_dict_from_file(fasta_file)
    by_id = dict()

    for record_id in fasta_records:
        residues = fasta_records[record_id]['s']

        new_polypeptide = things.Polypeptide(id=record_id, length=len(residues), residues=residues)
        default_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(record_id, DEFAULT_PRODUCT_NAME))

        new_polypeptide.annotation = default_annotation
        by_id[record_id] = new_polypeptide

    return by_id
def main():
    '''
    Command-line entry point.  Reads a multi-FASTA file and writes each distinct
    sequence once: the first record with a given residue string is written (ID, header
    and residues wrapped at 60 per line) and any later record whose residues have the
    same MD5 digest is skipped.  Output goes to --output_file when given, otherwise
    to STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Read a multi-FASTA file sequence and remove duplicates (by MD5 hash)')

    ## output file to be written
    parser.add_argument('-i',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)
        seen_digests = set()

        for seq_id in seqs:
            seq = seqs[seq_id]['s']

            # A fresh hash object is needed for every sequence: update() on a shared
            # object accumulates data, so the digest would cover all sequences seen so
            # far and identical sequences would never compare equal.
            md5sum = hashlib.md5(seq.encode()).hexdigest()

            if md5sum in seen_digests:
                continue

            seen_digests.add(md5sum)

            ## write this sequence, 60bp per line
            fout.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            for i in range(0, len(seq), 60):
                fout.write(seq[i:i + 60] + "\n")
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
예제 #18
0
def main():
    '''
    Command-line entry point.  Sorts the records of a multi-FASTA file into nucleotide
    and protein output files.  A record whose nucleotide_composition() value is at
    least --cutoff is written to the --nucleotide file; every other record is written
    to the --protein file.  Records destined for an output which was not requested are
    dropped.

    Raises an Exception if neither -p nor -n was given.
    '''
    parser = argparse.ArgumentParser( description='Split multi-FASTA file into separate protein and nucleotide files')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-p', '--protein', type=str, required=False, help='Path to the protein FASTA output file to be created' )
    parser.add_argument('-n', '--nucleotide', type=str, required=False, help='Path to the nucleotide FASTA output file to be created' )
    # Parsed as a number: a string value can't be compared against the composition
    # score below, so passing -c explicitly would otherwise fail.
    parser.add_argument('-c', '--cutoff', type=float, required=False, default=80, help='Min percent (1-100) of ATGCNX content to be considered a nucleotide sequence' )
    args = parser.parse_args()

    ## the user should have specified at least one
    if args.protein is None and args.nucleotide is None:
        raise Exception("ERROR: you must specify either -p or -n options (else why are you running this script?")

    pout = nout = None

    try:
        if args.protein is not None:
            pout = open(args.protein, 'wt')

        if args.nucleotide is not None:
            nout = open(args.nucleotide, 'wt')

        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            seq = seqs[seq_id]
            seqcomp = nucleotide_composition( seq['s'] )
            seq_wrapped = wrapped(seq['s'], every=60)

            # at or above the cutoff it's a nucleotide, otherwise it's a protein
            target = nout if seqcomp >= args.cutoff else pout

            if target is not None:
                target.write(">{0} {1}\n{2}\n".format(seq_id, seq['h'], seq_wrapped ) )
    finally:
        # close whichever output files were actually opened
        for fh in (pout, nout):
            if fh is not None:
                fh.close()
def main():
    '''
    Command-line entry point.  Rewrites a FASTA file so that sequence lines hold no
    more than --width residues, optionally upper-casing the residues first.  Each
    record is written with its ID and header.  Output goes to --output when given,
    otherwise to STDOUT.
    '''
    parser = argparse.ArgumentParser( description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-w', '--width', type=int, required=False, default=60, help='Width - number of residues per line' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Output file to be created.  Default = STDOUT' )
    parser.add_argument('-uc', '--upper_case', action='store_true', required=False, help='Forces all bases to be upper-case' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            record = seqs[seq_id]

            if args.upper_case:
                record['s'] = record['s'].upper()

            seq_wrapped = utils.wrapped_fasta(record['s'], every=args.width)
            fout.write(">{0} {1}\n{2}\n".format(seq_id, record['h'], seq_wrapped))
    finally:
        # only close handles this function opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point.  Writes a GFF3 file with one feature row per sequence in
    the input FASTA file.  Each row spans position 1 to the sequence length, uses
    --source for column 2 and --molecule_term for the feature type, and carries the
    sequence ID as its ID attribute.  A Name attribute holding the FASTA header is
    appended when the header is not empty.
    '''
    parser = argparse.ArgumentParser( description='Creates a GFF3 file from a genomic FASTA')

    ## output file to be written
    parser.add_argument('-i', '--input_fasta', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_gff3', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-s', '--source', type=str, required=True, help='Source, fills column 2 of the output GFF3 file' )
    parser.add_argument('-t', '--molecule_term', type=str, required=False, default='contig', help='SO term to represent genomic sequence type')
    args = parser.parse_args()

    # the context manager guarantees the output is flushed and closed
    with open(args.output_gff3, 'wt') as ofh:
        seqs = utils.fasta_dict_from_file(args.input_fasta)

        # header
        ofh.write("##gff-version 3\n")

        for seq_id in seqs:
            record = seqs[seq_id]
            ofh.write("{0}\t{1}\t{2}\t1\t{3}\t.\t.\t.\tID={0}".format(seq_id, args.source, args.molecule_term, len(record['s'])))

            # NOTE(review): the header is written verbatim; characters which are
            # reserved in GFF3 attributes (';', '=', ...) are not escaped here
            if len(record['h']) > 0:
                ofh.write(";Name={0}".format(record['h']))

            ofh.write("\n")
예제 #21
0
def main():
    '''
    Command-line entry point.  For each sequence in the input FASTA file, translates
    the three forward reading frames with utils.translate and finds the longest stretch
    of translated residues which is not interrupted by a stop ('*').  Prints one
    tab-delimited line per sequence: the sequence ID, the frame (1-3) holding the
    longest stretch, and that stretch's length.  The frame is None when no frame
    yields any residues.
    '''
    parser = argparse.ArgumentParser( description='Reports longest ORF length in all frames')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.input_file)

    for seq_id in seqs:
        winning_frame = None
        winning_frame_length = 0

        for frame in range(1, 4):
            offset = frame - 1
            seq = seqs[seq_id]['s'][offset:]
            polyseq = utils.translate(seq)

            longest_len = 0
            current_len = 0

            for residue in polyseq:
                if residue == '*':
                    if current_len > longest_len:
                        longest_len = current_len

                    # Every stop ends the current stretch, whether or not it was a new
                    # record.  Resetting only on a record would let shorter stretches
                    # run together across stops and inflate the result.
                    current_len = 0
                else:
                    current_len += 1

            # account for a stretch which runs to the end of the translation
            if current_len > longest_len:
                longest_len = current_len

            if longest_len > winning_frame_length:
                winning_frame = frame
                winning_frame_length = longest_len

        print("{0}\t{1}\t{2}".format(seq_id, winning_frame, winning_frame_length))
def get_protein_coordinates_from_FASTA(protein_fasta):
    '''
    This function is probably only relevant to a limited number of tasks where the protein input
    FASTA file to the pipeline was produced by transdecoder, which incorporates the predicted
    ORF coordinates into the FASTA header, like this:

    >m.13585 g.13585  ORF g.13585 m.13585 type:3prime_partial len:76 (+) comp100033_c0_seq1:118-348(+)

    Of all that, the only part we care about is that the first model number 'm.13585' matches that
    of the second column in the annotation file, along with the matching genomic molecule name at
    the end of the header 'comp100033_c0_seq1:118-348(+)'

    Returns a dict keyed on the model name (like 'm.13585') with the followed keyed values:
       'mol'    = molecule ID (like 'comp100033_c0_seq1')
       'fmin'   = 0-interbase start coordinate (117, from example above)
       'fmax'   = 0-interbase stop coordinate (348, from example above)
       'strand' = always 1, since only locations ending in '(+)' are matched

    Raises an Exception for any header which does not contain a forward-strand location
    in the expected format.
    '''
    protein_locs = dict()

    fasta_dict = utils.fasta_dict_from_file(protein_fasta)

    # raw string, so the regex escapes are not treated as (invalid) string escapes
    pattern = re.compile(r'(comp\d+_c\d+_seq\d+)\:(\d+)\-(\d+)\(\+\)')

    for model_id in fasta_dict:
        # NOTE(review): if fasta_dict is a plain dict its keys are already unique and
        # this check can never fire; kept as a safeguard
        if model_id in protein_locs:
            raise Exception("ERROR: found duplicate model ID in file: {0}".format(protein_fasta) )

        header = fasta_dict[model_id]['h']
        m = pattern.search(header)

        if not m:
            raise Exception("ERROR: unexpected header format.  Expected to parse something like comp100033_c0_seq1:118-348(+).  Got: {0}".format(header))

        # header coordinates are 1-based; convert the start to 0-interbase
        protein_locs[model_id] = { 'mol': m.group(1), 'fmin': int(m.group(2)) - 1, 'fmax': int(m.group(3)), 'strand': 1}

    return protein_locs
def main():
    '''
    Generates a scatter plot of median depth of coverage vs. (binned) molecule length.

    The pileup file is streamed line by line.  Column 1 is used as the molecule ID, column 2
    as the 1-based position and column 4 as the depth at that position.  Lines belonging to
    the same molecule must be adjacent, since a molecule is summarized as soon as the ID in
    column 1 changes.  Positions missing from the pileup count as 0 coverage.

    If --output_file is the literal string 'plot' the figure is shown interactively, else it
    is saved to that path.

    NOTE: --mol_size_limit is accepted but not currently used anywhere in this function.
    '''
    parser = argparse.ArgumentParser( description='Generates a figure showing coverage/abundance vs. molecule size.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input pileup file' )
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to the FASTA file of reference molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-s', '--mol_size_limit', type=int, required=False, default=5000, help='Results for molecules over this size will be grouped together' )
    parser.add_argument('-b', '--mol_bin_size', type=int, required=False, default=10, help='Set the binning resolution of the transcript size axis')
    args = parser.parse_args()

    ## first, we need a collection of the FASTA data and the molecule lengths
    molecules = utils.fasta_dict_from_file(args.fasta_file)

    ## data points for plotting
    #  structure like this:
    #    50 = { 30 => 2 }
    #  which means: There were 2 transcripts with median coverage of 30 whose
    #  length / mol_bin_size truncates to 50
    data_bins = defaultdict(lambda: defaultdict(int))

    current_molecule_id = None
    current_molecule_coverages = list()

    ## These files are usually huge.  For scalability, operations performed within this
    #  loop should be limited.
    with open(args.input_file) as pileup_fh:
        for line in pileup_fh:
            cols = line.split("\t")

            if cols[0] != current_molecule_id:
                # a new molecule has started, so summarize the one just finished (if any)
                if current_molecule_id is not None:
                    mol_length = len(molecules[current_molecule_id]['s'])
                    mol_length_bin = int(mol_length / args.mol_bin_size)
                    median_size = np.median(current_molecule_coverages)
                    data_bins[mol_length_bin][median_size] += 1
                    print("DEBUG: molecule {0} appeared to be {1} bp in length with median coverage of {2}".format(current_molecule_id, mol_length, median_size))

                # reset
                current_molecule_id = cols[0]
                current_molecule_coverages = [0] * len(molecules[cols[0]]['s'])

            try:
                current_molecule_coverages[int(cols[1]) - 1] = int(cols[3])
            except IndexError:
                print("ERROR: pileup file reports position {0} coverage but transcript {1} is only {2} bp in length".format(cols[1], current_molecule_id, len(molecules[cols[0]]['s'])) )

    # don't forget the last one (there isn't one if the pileup file was empty)
    if current_molecule_id is not None:
        mol_length_bin = int(len(molecules[current_molecule_id]['s']) / args.mol_bin_size)
        median_size = np.median(current_molecule_coverages)
        data_bins[mol_length_bin][median_size] += 1

    ## now generate the plot data - x,y positions and radii
    x = list()
    y = list()
    r = list()

    for bin_size in data_bins:
        for cov in data_bins[bin_size]:
            x.append(bin_size)
            y.append(cov)
            r.append(data_bins[bin_size][cov])

    plt.xlabel('Molecule length')
    plt.ylabel('Median depth of coverage')
    #plt.xlim(0,2000)
    #plt.ylim(0,500)
    plt.scatter(x, y, s=r, alpha=0.5)

    if args.output_file == 'plot':
        plt.show()
    else:
        plt.savefig(args.output_file)
예제 #24
0
def main():
    '''
    Generates a scatter plot of median depth of coverage vs. (binned) molecule length.

    The pileup file is streamed line by line.  Column 1 is used as the molecule ID, column 2
    as the 1-based position and column 4 as the depth at that position.  Lines belonging to
    the same molecule must be adjacent, since a molecule is summarized as soon as the ID in
    column 1 changes.  Positions missing from the pileup count as 0 coverage.

    If --output_file is the literal string 'plot' the figure is shown interactively, else it
    is saved to that path.

    NOTE: --mol_size_limit is accepted but not currently used anywhere in this function.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Generates a figure showing coverage/abundance vs. molecule size.')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input pileup file')
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to the FASTA file of reference molecules')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-s',
        '--mol_size_limit',
        type=int,
        required=False,
        default=5000,
        help='Results for molecules over this size will be grouped together')
    parser.add_argument(
        '-b',
        '--mol_bin_size',
        type=int,
        required=False,
        default=10,
        help='Set the binning resolution of the transcript size axis')
    args = parser.parse_args()

    ## first, we need a collection of the FASTA data and the molecule lengths
    molecules = utils.fasta_dict_from_file(args.fasta_file)

    ## data points for plotting
    #  structure like this:
    #    50 = { 30 => 2 }
    #  which means: There were 2 transcripts with median coverage of 30 whose
    #  length / mol_bin_size truncates to 50
    data_bins = defaultdict(lambda: defaultdict(int))

    current_molecule_id = None
    current_molecule_coverages = list()

    ## These files are usually huge.  For scalability, operations performed within this
    #  loop should be limited.
    with open(args.input_file) as pileup_fh:
        for line in pileup_fh:
            cols = line.split("\t")

            if cols[0] != current_molecule_id:
                # a new molecule has started, so summarize the one just finished (if any)
                if current_molecule_id is not None:
                    mol_length = len(molecules[current_molecule_id]['s'])
                    mol_length_bin = int(mol_length / args.mol_bin_size)
                    median_size = np.median(current_molecule_coverages)
                    data_bins[mol_length_bin][median_size] += 1
                    print(
                        "DEBUG: molecule {0} appeared to be {1} bp in length with median coverage of {2}"
                        .format(current_molecule_id, mol_length, median_size))

                # reset
                current_molecule_id = cols[0]
                current_molecule_coverages = [0] * len(molecules[cols[0]]['s'])

            try:
                current_molecule_coverages[int(cols[1]) - 1] = int(cols[3])
            except IndexError:
                print(
                    "ERROR: pileup file reports position {0} coverage but transcript {1} is only {2} bp in length"
                    .format(cols[1], current_molecule_id,
                            len(molecules[cols[0]]['s'])))

    # don't forget the last one (there isn't one if the pileup file was empty)
    if current_molecule_id is not None:
        mol_length_bin = int(
            len(molecules[current_molecule_id]['s']) / args.mol_bin_size)
        median_size = np.median(current_molecule_coverages)
        data_bins[mol_length_bin][median_size] += 1

    ## now generate the plot data - x,y positions and radii
    x = list()
    y = list()
    r = list()

    for bin_size in data_bins:
        for cov in data_bins[bin_size]:
            x.append(bin_size)
            y.append(cov)
            r.append(data_bins[bin_size][cov])

    plt.xlabel('Molecule length')
    plt.ylabel('Median depth of coverage')
    #plt.xlim(0,2000)
    #plt.ylim(0,500)
    plt.scatter(x, y, s=r, alpha=0.5)

    if args.output_file == 'plot':
        plt.show()
    else:
        plt.savefig(args.output_file)
def main():
    """
    Writes a list of organism 1 mRNA IDs supported by both AAT and BLAST evidence.

    Steps:
      1. Parse the organism 1 GFF3 annotation and the FASTA database used by AAT.
      2. Parse the AAT GFF3 (nucleotide_to_protein_match rows followed by their match_part
         rows) into Match objects, grouped by assembly.
      3. Keep every mRNA which overlaps at least one AAT match where both the mRNA and the
         match target coverage values returned by calculate_fragmented_coverage() meet
         --aat_percent_coverage_cutoff.
      4. Of those, keep the ones with a BLAST btab row passing the e-value and percent
         identity cutoffs, and write their IDs (one per line) to --output_id_list.

    If --output_id_list isn't passed, a file name is built from the cutoff values.

    Raises:
      Exception: if a match_part row appears before any match row, if an overlap is larger
                 than the mRNA itself, or if an AAT match target is missing from -aatdb.
    """
    parser = argparse.ArgumentParser(description="Put a description of your script here")

    parser.add_argument("-a", "--organism1_annotation", type=str, required=True, help="Annotation GFF for organism 1")
    parser.add_argument(
        "-p", "--organism1_aat_alignments", type=str, required=True, help="Path to AAT GFF3 (match/match_part)"
    )
    parser.add_argument(
        "-aatdb", "--aat_fasta_db", type=str, required=True, help="Path to FASTA database that was used in AAT"
    )
    parser.add_argument(
        "-b",
        "--organism1_blast_alignments",
        type=str,
        required=True,
        help="Path to BLASTp btab file vs.organism 2 proteins",
    )
    parser.add_argument(
        "-be", "--blast_eval_cutoff", type=float, required=False, default=1e-5, help="BLAST e-value cutoff"
    )
    # argparse %-formats help strings, so a literal percent sign has to be written as %%
    #  (otherwise printing the help with -h raises)
    parser.add_argument(
        "-bpi", "--blast_percent_identity_cutoff", type=float, required=False, default=0, help="BLAST %%identity cutoff"
    )
    parser.add_argument(
        "-ppc",
        "--aat_percent_coverage_cutoff",
        type=float,
        required=False,
        default=0,
        help="%% coverage of the query protein by the AAT match",
    )
    parser.add_argument(
        "-o", "--output_id_list", type=str, required=False, help="List of IDs from organism1 that passed"
    )
    args = parser.parse_args()

    # set this to an mRNA ID to restrict the overlap comparison to that one transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff, args.aat_percent_coverage_cutoff
        )

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0

    # the match currently being assembled from its rows, and the assembly it is located on
    current_match = None
    current_match_assembly_id = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith("#") or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], "ID").replace('"', "")
            target = gff.column_9_value(cols[8], "Target")

            # only the first whitespace-delimited token of the Target value is kept as the ID
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == "nucleotide_to_protein_match":
                # A new match row means the previous match is complete.  Store it under the
                #  assembly it was located on, which may differ from this row's assembly.
                if current_match is not None:
                    aat_matches[current_match_assembly_id].append(current_match)
                    aat_match_count += 1

                current_match = things.Match(
                    id=feature_id, target_id=target, subclass="nucleotide_to_protein_match", length=fmax - fmin
                )
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)
                current_match_assembly_id = assembly_id

            elif cols[2] == "match_part":
                if current_match is None:
                    raise Exception(
                        "ERROR: Found match_part ({0}) before any nucleotide_to_protein_match".format(feature_id)
                    )

                parent_id = gff.column_9_value(cols[8], "Parent").replace('"', "")
                match_part = things.MatchPart(id=feature_id, parent=parent_id, length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    # don't forget the last one - nothing follows it in the file to trigger its storage
    if current_match is not None:
        aat_matches[current_match_assembly_id].append(current_match)
        aat_match_count += 1

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length)
                        )

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(
                                aat_match.target_id
                            )
                        )

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]["s"]) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length
                    )

                    if (
                        mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                        and target_percent_coverage >= args.aat_percent_coverage_cutoff
                    ):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    #  If several rows for the same query pass the cutoffs, the last one read is kept.
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [o1_mRNA_id for o1_mRNA_id in o1_with_aat if o1_mRNA_id in top_blast_hits]

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2))
    )

    # the context manager guarantees the list is flushed and closed
    with open(args.output_id_list, "wt") as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
예제 #26
0
def main():
    '''
    Parses nucmer coords output to provide an overall coverage report.

    Alignment rows (those with exactly 11 whitespace-delimited columns) are grouped by the
    reference ID in column 10.  Rows for the same reference must be adjacent, since a
    reference is processed as soon as that ID changes.  Each finished group is handed to
    calculate_gene_coverage_fragments() (when the reference has annotation) and to
    calculate_fragment_coverage().

    Six output files are created using --output_prefix:
        .stats.gene_coverage, .list.genes_missing, .stats.refmol_coverage,
        .tab.refmol_coverage, .tab.extensions, .tab.gene_coverage

    Raises:
        Exception: if the coords file contains no 11-column alignment lines.
    '''
    # imported here so this function doesn't rely on a module-level import being present
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(
        description=
        'Parses nucmer coords output to provide an overall coverage report')

    ## coords file generated with: show-coords -l -r -T out.delta
    parser.add_argument(
        '-c',
        '--coords_file',
        type=str,
        required=True,
        help='Path to a nucmer coords file with non-overlapping results (requires -l -r -T options of show-coords)')
    parser.add_argument(
        '-o',
        '--output_prefix',
        type=str,
        required=True,
        help='Several output files will be created with this prefix.')
    parser.add_argument('-a',
                        '--annotation_file',
                        type=str,
                        required=True,
                        help='Path to a sorted GFF3 annotation file')
    parser.add_argument('-r',
                        '--reference_fasta',
                        type=str,
                        required=True,
                        help='Path to the reference file used with nucmer')
    parser.add_argument(
        '-k',
        '--annotation_key',
        type=str,
        required=False,
        help=
        'Optional.  Key string to look for in the 9th column of the GFF3 file for an annotation string.'
    )
    args = parser.parse_args()

    ## like: h[$assem] = [ {id=?,fmin=?,fmax=?}, ...  ]
    annot = parse_annotation(args.annotation_file, args.annotation_key)

    ## like: [ {id=?,qfmin=?,qfmax=?,rfmin=?,rfmax=?} ]
    query_fragments = []

    ref_molecules = utils.fasta_dict_from_file(args.reference_fasta)
    ref_n_total = 0

    for ref_id in ref_molecules:
        ref_n_total += len(ref_molecules[ref_id]['s'])

    ## Every file handle is registered with the ExitStack so all of them are flushed and
    #  closed when the block exits, including when an exception is raised part-way through.
    with ExitStack() as stack:

        def open_output(suffix):
            return stack.enter_context(open(args.output_prefix + suffix, "wt"))

        ## open the output files
        genecov_stats_ofh = open_output(".stats.gene_coverage")
        genesmissing_list_ofh = open_output(".list.genes_missing")
        refmol_stats_ofh = open_output(".stats.refmol_coverage")
        refcov_stats_ofh = open_output(".tab.refmol_coverage")
        refext_list_ofh = open_output(".tab.extensions")
        genecov_tab_ofh = open_output(".tab.gene_coverage")
        refext_list_ofh.write("# {0}\n".format(args.output_prefix))
        refext_list_ofh.write(
            "# reference_id\tref_fmin\tref_fmax\tref_strand\tqry_id\tqry_fmin\tqry_fmax\tqry_strand\tqry_length\n"
        )

        ref_cov_stats = {'n_cov': 0, 'n_uncov': 0, 'n_identical': 0}

        alignment_lines_found = 0
        current_ref_id = None

        coords_fh = stack.enter_context(open(args.coords_file, 'r'))

        for line in coords_fh:
            cols = line.split()

            # anything which isn't an 11-column alignment row (headers, etc.) is ignored
            if len(cols) == 11:
                alignment_lines_found += 1
            else:
                continue

            cols[0] = int(cols[0])
            cols[1] = int(cols[1])
            cols[2] = int(cols[2])
            cols[3] = int(cols[3])

            if cols[9] != current_ref_id:
                # the reference changed, so process the fragments collected for the last one
                if current_ref_id is not None:
                    if current_ref_id in annot:
                        calculate_gene_coverage_fragments(
                            annot[current_ref_id], query_fragments)

                    calculate_fragment_coverage(
                        current_ref_id, query_fragments, current_ref_length,
                        ref_cov_stats, refcov_stats_ofh, refext_list_ofh)

                ## reset
                current_ref_id = cols[9]
                current_ref_length = int(cols[7])
                query_fragments = []

                ## quick sanity check
                if current_ref_id not in annot:
                    print(
                        "WARNING: found a nucleotide accession for which we have no annotation: {0}"
                        .format(current_ref_id))

            # query coordinates given in descending order mean a reverse-strand alignment
            qstrand = 1

            if cols[2] > cols[3]:
                qstrand = -1

            # coordinates are stored as 0-interbase (min - 1, max)
            fragment = {}
            fragment['id'] = cols[10]
            fragment['qfmin'] = min(cols[2], cols[3]) - 1
            fragment['qfmax'] = max(cols[2], cols[3])
            fragment['qlen'] = int(cols[8])
            fragment['qstrand'] = qstrand
            fragment['rfmin'] = min(cols[0], cols[1]) - 1
            fragment['rfmax'] = max(cols[0], cols[1])
            fragment['rlen'] = int(cols[7])
            fragment['pctid'] = float(cols[6])
            query_fragments.append(fragment)

        ## don't forget the last one
        if current_ref_id is not None:
            if current_ref_id in annot:
                calculate_gene_coverage_fragments(annot[current_ref_id],
                                                  query_fragments)

            calculate_fragment_coverage(current_ref_id, query_fragments,
                                        current_ref_length, ref_cov_stats,
                                        refcov_stats_ofh, refext_list_ofh)

        if alignment_lines_found == 0:
            raise Exception(
                "ERROR: failed to find any 11-column alignment lines")
        else:
            print("INFO: {0} alignment lines found".format(
                alignment_lines_found))

        report_gene_coverage_results(annot, genecov_stats_ofh,
                                     genesmissing_list_ofh, genecov_tab_ofh)

        cov_perc = (ref_cov_stats['n_cov'] / ref_n_total) * 100
        cov_perc_id = (ref_cov_stats['n_identical'] / ref_n_total) * 100
        refmol_stats_ofh.write(
            "Total bases in reference molecules\t{0}\n".format(ref_n_total))
        refmol_stats_ofh.write(
            "Ref bases covered by query fragments\t{0}\n".format(
                ref_cov_stats['n_cov']))
        refmol_stats_ofh.write(
            "Ref % covered by query fragments\t{0:.2f}\n".format(cov_perc))
        refmol_stats_ofh.write(
            "Ref % identity by query fragments\t{0:.2f}\n".format(cov_perc_id))
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll need
    to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count the space from
    the beginning of the contig until the first gene, or only between them?  What about short contigs which
    have no annotated genes at all?  From the Sequence Ontology:

    SO:0000605: A region containing or overlapping no genes that is bounded on either side by a gene, or
    bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include the
    former in intergenic space reporting but include the latter as a separate statistic.

    Overlapping genes:
    Genes are walked in sorted order and each intergenic distance is measured from the furthest
    end (fmax) reached by any gene seen so far on the molecule, so a gene nested inside or
    overlapping an earlier one doesn't create or lengthen an intergenic space.

    Raises an Exception if any molecule has an undefined or 0 length.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser( description='Reports statistics on intron and intergenic space sizes for a GFF3 annotation.')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)' )
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)' )
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)

    # attach residues and lengths from the separate FASTA file, if one was given
    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    # bases on molecules with at least one gene vs. those on gene-less molecules
    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # NOTE(review): presumably gene objects sort by position on the molecule - confirm
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # This has to test THIS molecule's genes rather than the running total, else a
        #  gene-less molecule is only recognized if it comes before every annotated one.
        if len(genes) == 0:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # location of the gene reaching furthest along the molecule so far
        furthest_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if furthest_gene_loc is None:
                # this is the first gene, so track the number of bases from the start of the molecule
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)

            elif gene_loc.fmin >= furthest_gene_loc.fmax:
                # no overlap with anything before it, so there's a gap to record
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin - furthest_gene_loc.fmax
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)

            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

            # only advance the reference point if this gene extends past everything before it
            if furthest_gene_loc is None or gene_loc.fmax > furthest_gene_loc.fmax:
                furthest_gene_loc = gene_loc

        # space between the end of the last gene and the end of the molecule
        if furthest_gene_loc is not None:
            total_intergenic_space_count += 1
            intergenic_distance = assembly.length - furthest_gene_loc.fmax
            total_intergenic_space_residues += intergenic_distance
            intergenic_distances.append(intergenic_distance)

    # Both lists are kept as (possibly empty) sorted lists.  The median reported is the
    #  element at index len/2, i.e. the upper of the two middle values for even counts.
    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    if total_intergenic_space_count == 0:
        avg_intergenic_space_dist = None
        median_int_space_dist = None
    else:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[ int(len(intergenic_distances)/2) ]

    # annotations without any introns are legitimate, so don't divide by zero on them
    if total_intron_count == 0:
        avg_intron_size = None
        median_intron_size = None
    else:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[int(len(intron_sizes)/2)]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count) )

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues) )
    print("Empty molecule bases: {0} bp".format(empty_contig_residues) )

    if total_intergenic_space_count > 0:
        print("Intergenic space count: {0}".format(total_intergenic_space_count) )
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist) )
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist) )
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]) )
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]) )
    else:
        print("There were no intergenic spaces found.  This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count) )
    print("Intron space count: {0}".format(total_intron_residues) )

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size) )
        print("Median intron size: {0} bp".format(median_intron_size) )
        print("Minimum intron size: {0} bp".format(intron_sizes[0]) )
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]) )
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot the series which actually have data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances' )

        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes' )

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
def main():
    """Tally read orientations per transcript from a SAM file.

    Each alignment is counted under its reference transcript by mate number
    (the last character of the read name, expected to be '1' or '2') and by
    whether SAM flag bit 16 (SEQ reverse-complemented) is set.  A transcript
    is treated as correctly oriented when its 1:T count exceeds its 1:F count
    (the expectation for an RF library); otherwise it is flagged for
    correction.  One tab-delimited tally line is printed per transcript,
    followed by summary totals.

    If -fi and -fo are passed, every sequence of the input FASTA is written
    to the output FASTA, with flagged transcripts reverse-complemented.

    Alignments of the same transcript are assumed to be adjacent in the SAM
    file; a transcript whose records are split apart is tallied once per run.

    Raises:
        Exception: if -fi is passed without -fo.
        KeyError: if a read name does not end in '1' or '2'.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True, help='Input SAM file with reads aligned to reference' )
    parser.add_argument('-fi', '--fasta_in', type=str, required=False, help='Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument' )
    parser.add_argument('-fo', '--fasta_out', type=str, required=False, help='If passed along with -fi, the orientation-corrected sequences will be written here.' )
    args = parser.parse_args()

    # Check the option pairing up front so we fail before any parsing work.
    if args.fasta_in is not None and args.fasta_out is None:
        raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

    seqs = dict()
    if args.fasta_in is not None:
        seqs = utils.fasta_dict_from_file(args.fasta_in)

    total_read_mappings = 0
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0
    transcripts_to_correct = set()

    last_transcript_id = None
    counts = None

    def finish_transcript():
        """Classify and report the transcript whose tally was just completed."""
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if counts['1']['T'] > counts['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct.add(last_transcript_id)

        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            last_transcript_id, counts['1']['T'], counts['1']['F'],
            counts['2']['T'], counts['2']['F']))

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            # skip SAM header lines
            if line.startswith('@'):
                continue

            cols = line.split("\t")
            if len(cols) < 5:
                continue

            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1

            # flag bit 16: SEQ is stored reverse-complemented
            seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

            if transcript_id != last_transcript_id:
                if last_transcript_id is not None:
                    finish_transcript()

                ## start a fresh tally for the new transcript
                transcript_count += 1
                last_transcript_id = transcript_id
                counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }

            # Count every read, including the one that opened this transcript.
            counts[read_dir][seq_revcomped] += 1

    # The final transcript has no following record to trigger its evaluation.
    if last_transcript_id is not None:
        finish_transcript()

    if args.fasta_in is not None:
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = utils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(seq_id, utils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
예제 #29
0
def main():
    """Report, per reference molecule, how many bases have evidence coverage.

    A per-base coverage list (initially all zeros) is created for every
    sequence in the reference FASTA.  Each pileup or SAM evidence file is
    handed to parse_pileup() / parse_sam() together with those lists
    (presumably they increment the per-base depths in place -- confirm
    against those helpers).  Other accepted extensions are currently only
    reported as ignored.  One line of "id <tab> length <tab> covered bases"
    is written per molecule to -o, or to STDOUT when -o is omitted.

    Raises:
        Exception: on an unsupported evidence file extension, a --reference
            value without a ':' separator, or a reference path that does not
            end in .gff3.
    """
    parser = argparse.ArgumentParser(
        description='Provides coverage information for features in a GFF3 file'
    )

    parser.add_argument(
        'evidence_files',
        metavar='N',
        type=str,
        nargs='+',
        help='Path to one or more evidence files, separated by spaces')
    parser.add_argument(
        '-r',
        '--reference',
        type=str,
        required=True,
        help=
        'Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE'
    )
    parser.add_argument('-f',
                        '--fasta',
                        type=str,
                        required=True,
                        help='Input path to the reference FASTA file.')
    parser.add_argument(
        '-o',
        '--output_file',
        type=str,
        required=False,
        help=
        'Optional path to an output file to be created, else prints on STDOUT')
    args = parser.parse_args()

    ## parse the fasta
    fasta = utils.fasta_dict_from_file(args.fasta)

    ####################################################
    ## Sanity checks

    allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
    for ev_file in args.evidence_files:
        if not ev_file.endswith(tuple(allowed_extensions)):
            raise Exception(
                "ERROR: Evidence file passed with unsupported file extension: {0}.  Supported extensions are {1}"
                .format(ev_file, allowed_extensions))

    ## The reference should be defined as $path:$feattype
    if ':' not in args.reference:
        raise Exception(
            "ERROR: input_file must be like /path/to/some.gff3:mRNA")

    ref_file_parts = args.reference.split(':')
    print("DEBUG: part count: {0}".format(len(ref_file_parts)))

    if ref_file_parts[0].endswith('.gff3'):
        # NOTE: the parsed reference features are not used further below.
        (ref_assemblies,
         ref_features) = gff.get_gff3_features(ref_file_parts[0])
    else:
        # The reference is passed with -r; there is no -i option.
        raise Exception(
            "ERROR: Expected input file (-r) to have a gff3 extension, got {0}"
            .format(ref_file_parts[0]))

    ####################################################
    ## Initialize the coverage arrays

    fasta_cov = dict()
    for seq_id in fasta:
        # create a list of 0s the length of the molecule
        fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

    ####################################################
    ## Now parse the evidence files

    for ev_file in args.evidence_files:
        if ev_file.endswith('pileup'):
            parse_pileup(fasta_cov, ev_file)
        elif ev_file.endswith('sam'):
            parse_sam(fasta_cov, ev_file)
        else:
            print(
                "INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented"
                .format(ev_file))

    ####################################################
    ## Write the report

    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        for seq_id in fasta_cov:
            # Iterate over the depth values themselves; they must not be
            # reused as list indexes.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id,
                                                len(fasta[seq_id]['s']),
                                                covered_bases))
    finally:
        # Only close handles we opened ourselves, never STDOUT.
        if args.output_file is not None:
            fout.close()
def main():
    """Report non-standard residues in a multi-FASTA file, optionally replacing one.

    Every residue is upper-cased and checked against the standard alphabet
    for the chosen --type (n: ATGCU, p: the 20 amino acid letters) and
    against the --ignore characters.  Anything else is tallied per sequence;
    locations go to STDERR when -pl is passed and per-sequence tallies go to
    the --list file when one is given.  When --replace/--with_ are passed,
    each sequence is written (60 residues per line) to --output, or STDOUT,
    with the replacement applied.

    Raises:
        Exception: if only one of --replace / --with_ is passed.
    """
    parser = argparse.ArgumentParser(description='Reports on non-standard characters in multifasta files and can optionally replace residues')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file')
    parser.add_argument('-t', '--type', type=str, required=True, choices=('n', 'p'), help='Either n for nucleotide or p for protein')
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created if doing replacement')
    parser.add_argument('-pl', '--print_locations', dest='print_locations', action='store_true', help='If passed, will report coordinate of each non-standard residue on STDERR')
    parser.add_argument('-r', '--replace', type=str, required=False, help='Replace this character with the one defined by --with_')
    parser.add_argument('-w', '--with_', type=str, required=False, help='This character or set replaces all instances of the one found in --replace')
    parser.add_argument('-l', '--list', type=str, required=False, help='Optional file of IDs where non-standard residues were detected or replaced')
    parser.add_argument('-g', '--ignore', type=str, required=False, default='N*X', help='List of characters to not report as non-standard.  Default = the universal ambiguity bases (N, X) or the end-of-translation stop for proteins (*)')
    parser.set_defaults(print_locations=False)
    args = parser.parse_args()

    out_fh = sys.stdout if args.output is None else open(args.output, 'wt')

    ## --replace and --with_ only make sense as a pair
    replacing = args.replace is not None
    if replacing and args.with_ is None:
        raise Exception("ERROR: You must pass --with_ when passing --replace")
    if args.with_ is not None and not replacing:
        raise Exception("ERROR: You must pass --replace when passing --with_")

    seqs = utils.fasta_dict_from_file(args.input)

    ## the standard alphabet depends on the type of sequence
    if args.type == 'n':
        standard_residues = set("ATGCU")
    else:
        standard_residues = set("ACDEFGHIKLMNPQRSTVWY")

    list_fh = open(args.list, 'wt') if args.list is not None else None

    ## characters the user asked us not to report
    ignore_residues = {residue.upper() for residue in args.ignore}

    for seq_id in seqs:
        record = seqs[seq_id]
        bad_chars = dict()

        ## positions are reported 1-based
        for position, residue in enumerate(record['s'], start=1):
            ubase = residue.upper()
            if ubase in standard_residues or ubase in ignore_residues:
                continue

            bad_chars[ubase] = bad_chars.get(ubase, 0) + 1

            if args.print_locations:
                print("Molecule {0} contains residue {1} at position {2}".
                      format(seq_id, ubase, position),
                      file=sys.stderr)

        if list_fh is not None and bad_chars:
            tallies = "".join("\t{0}:{1}".format(ubase, bad_chars[ubase])
                              for ubase in bad_chars)
            list_fh.write("{0}".format(seq_id) + tallies + "\n")

        if replacing:
            record['s'] = record['s'].replace(args.replace, args.with_)
            out_fh.write(">{0} {1}\n".format(seq_id, record['h']))

            ## wrap the residues at 60 per line
            for start in range(0, len(record['s']), 60):
                out_fh.write(record['s'][start:start + 60] + "\n")
예제 #31
0
def main():
    """Replace long homopolymeric stretches in a FASTA file with N characters.

    Each sequence is split into runs of identical characters (the comparison
    is case-sensitive).  Any run longer than --homopolymer_length_limit whose
    character is not 'N' is replaced by the same number of 'N's, and a
    warning giving the 1-based start position is written to STDERR.  All
    sequences, masked or not, are written to --output, or STDOUT.
    """
    import itertools

    parser = argparse.ArgumentParser(
        description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument(
        '-hll',
        '--homopolymer_length_limit',
        type=int,
        required=True,
        help=
        'Stretches of non-N residues longer than this will be replaced with Ns'
    )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open(args.output, 'wt')

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = utils.fasta_dict_from_file(args.input)

        sys.stderr.write(
            "INFO: Looking for homopolymeric runs > {0} bp\n".format(
                args.homopolymer_length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            masked_parts = []
            run_start_idx = 0

            # groupby yields every maximal run of identical residues, so each
            # run (including one that ends the sequence) is handled exactly
            # once and earlier replacements are never overwritten.
            for base, run in itertools.groupby(seqs[seq_id]['s']):
                run_length = sum(1 for _ in run)

                if run_length > args.homopolymer_length_limit and base != 'N':
                    sys.stderr.write(
                        "WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n"
                        .format(seq_id, run_start_idx + 1, base, run_length))
                    sys.stderr.flush()
                    masked_parts.append('N' * run_length)
                else:
                    masked_parts.append(base * run_length)

                run_start_idx += run_length

            seqs[seq_id]['s'] = "".join(masked_parts)
            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(utils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # Only close handles we opened ourselves, never STDOUT.
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    """Write one FASTA record per single-isoform gene with introns lower-cased.

    Features are loaded from the GFF3 passed with -i; when -f is passed,
    residues of matching assemblies are taken from that FASTA file.  For each
    gene the residues are upper-cased, every intron range of its mRNA is
    lower-cased and, when --type is CDS, the sequence is trimmed to the span
    from the fmin of the first sorted CDS to the fmax of the last.  Genes
    with more than one mRNA are reported and skipped.  Records are written to
    -o, or to STDOUT when -o is omitted.
    """
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA', choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # Set this to a gene ID to process only that gene with the debug print
    # statements enabled; leave it as None for normal runs.
    debugging_gene = None

    if args.fasta is not None:
        fasta_records = utils.fasta_dict_from_file(args.fasta)
        for molecule_id in fasta_records:
            if molecule_id not in assemblies:
                continue
            assemblies[molecule_id].residues = fasta_records[molecule_id]['s']
            assemblies[molecule_id].length = len(assemblies[molecule_id].residues)

    ## output will either be a file or STDOUT
    ofh = open(args.output_file, 'wt') if args.output_file is not None else sys.stdout

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            debug_mode = debugging_gene is not None
            if debug_mode and gene.id != debugging_gene:
                continue

            gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

            gene_seq = gene.get_residues().upper()
            gene_loc = gene.location_on(assembly)
            on_reverse_strand = gene_loc.strand == -1

            # Reverse-strand genes are flipped while slicing so that string
            # offsets line up with assembly coordinates, then flipped back
            # before the record is written.
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            if debug_mode:
                print("INFO: Processing gene with length {0} at {1}-{2}".format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

            if len(gene.mRNAs()) > 1:
                print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                continue

            for mRNA in gene.mRNAs():
                introns = mRNA.introns(on=assembly)

                # subtracting this turns an assembly coordinate into an
                # index within gene_seq
                offset = gene_loc.fmin

                for intron in introns:
                    intron_loc = intron.location_on(assembly)
                    rel_start = intron_loc.fmin - offset
                    rel_end = intron_loc.fmax - offset
                    lower_mid = gene_seq[rel_start:rel_end].lower()
                    gene_seq = gene_seq[:rel_start] + lower_mid + gene_seq[rel_end:]

                    if debug_mode:
                        print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(rel_start, rel_end))
                        print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                if debug_mode:
                    print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                ## only CDS exports need trimming down to the CDS range
                if args.type != 'CDS':
                    continue

                CDSs = sorted(mRNA.CDSs())
                CDS_min = CDSs[0].location_on(assembly).fmin
                CDS_max = CDSs[-1].location_on(assembly).fmax

                if debug_mode:
                    print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(CDS_max, CDS_min, CDS_max - CDS_min))

                if gene_loc.fmin == CDS_min and gene_loc.fmax == CDS_max:
                    continue

                # number of residues to drop from each end of the gene
                fmin_chomp = CDS_min - offset
                fmax_chomp = gene_loc.fmax - CDS_max

                if debug_mode:
                    print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                        gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand, CDS_min, CDS_max))
                    print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                    print("\tpulling range: gene_seq[{0} : {1}]".format(fmin_chomp, len(gene_seq) - fmax_chomp))

                gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                if debug_mode:
                    print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

            ## make sure to switch it back
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            ofh.write(">{0}\n{1}\n".format(gene_label, utils.wrapped_fasta(gene_seq)))
def main():
    """Write one FASTA record per single-isoform gene with introns lower-cased.

    Features are loaded from the GFF3 passed with -i; when -f is passed,
    residues of matching assemblies are taken from that FASTA file.  For each
    gene the residues are upper-cased, every intron range of its mRNA is
    lower-cased and, when --type is CDS, the sequence is trimmed to the span
    from the fmin of the first sorted CDS to the fmax of the last.  Genes
    with more than one mRNA are reported and skipped.  Records are written to
    -o, or to STDOUT when -o is omitted.
    """
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA', choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # Set this to a gene ID to process only that gene with the debug print
    # statements enabled; leave it as None for normal runs.
    debugging_gene = None

    if args.fasta is not None:
        fasta_records = utils.fasta_dict_from_file(args.fasta)
        for molecule_id in fasta_records:
            if molecule_id not in assemblies:
                continue
            assemblies[molecule_id].residues = fasta_records[molecule_id]['s']
            assemblies[molecule_id].length = len(assemblies[molecule_id].residues)

    ## output will either be a file or STDOUT
    ofh = open(args.output_file, 'wt') if args.output_file is not None else sys.stdout

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            debug_mode = debugging_gene is not None
            if debug_mode and gene.id != debugging_gene:
                continue

            gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

            gene_seq = gene.get_residues().upper()
            gene_loc = gene.location_on(assembly)
            on_reverse_strand = gene_loc.strand == -1

            # Reverse-strand genes are flipped while slicing so that string
            # offsets line up with assembly coordinates, then flipped back
            # before the record is written.
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            if debug_mode:
                print("INFO: Processing gene with length {0} at {1}-{2}".format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

            if len(gene.mRNAs()) > 1:
                print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                continue

            for mRNA in gene.mRNAs():
                introns = mRNA.introns(on=assembly)

                # subtracting this turns an assembly coordinate into an
                # index within gene_seq
                offset = gene_loc.fmin

                for intron in introns:
                    intron_loc = intron.location_on(assembly)
                    rel_start = intron_loc.fmin - offset
                    rel_end = intron_loc.fmax - offset
                    lower_mid = gene_seq[rel_start:rel_end].lower()
                    gene_seq = gene_seq[:rel_start] + lower_mid + gene_seq[rel_end:]

                    if debug_mode:
                        print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(rel_start, rel_end))
                        print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                if debug_mode:
                    print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                ## only CDS exports need trimming down to the CDS range
                if args.type != 'CDS':
                    continue

                CDSs = sorted(mRNA.CDSs())
                CDS_min = CDSs[0].location_on(assembly).fmin
                CDS_max = CDSs[-1].location_on(assembly).fmax

                if debug_mode:
                    print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(CDS_max, CDS_min, CDS_max - CDS_min))

                if gene_loc.fmin == CDS_min and gene_loc.fmax == CDS_max:
                    continue

                # number of residues to drop from each end of the gene
                fmin_chomp = CDS_min - offset
                fmax_chomp = gene_loc.fmax - CDS_max

                if debug_mode:
                    print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                        gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand, CDS_min, CDS_max))
                    print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                    print("\tpulling range: gene_seq[{0} : {1}]".format(fmin_chomp, len(gene_seq) - fmax_chomp))

                gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                if debug_mode:
                    print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

            ## make sure to switch it back
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            ofh.write(">{0}\n{1}\n".format(gene_label, utils.wrapped_fasta(gene_seq)))
예제 #34
0
 def load_from_file(self, file):
     """Add one Polypeptide to this object for every record in a FASTA file.

     Each record's ID becomes the polypeptide id and its 's' entry (the
     sequence string) becomes the residues.
     """
     fasta_records = utils.fasta_dict_from_file(file)

     for identifier in fasta_records:
         self.add(Polypeptide(id=identifier,
                              residues=fasta_records[identifier]['s']))
예제 #35
0
def main():
    """Report, per reference molecule, how many bases have evidence coverage.

    A per-base coverage list (initially all zeros) is created for every
    sequence in the reference FASTA.  Each pileup or SAM evidence file is
    handed to parse_pileup() / parse_sam() together with those lists
    (presumably they increment the per-base depths in place -- confirm
    against those helpers).  Other accepted extensions are currently only
    reported as ignored.  One line of "id <tab> length <tab> covered bases"
    is written per molecule to -o, or to STDOUT when -o is omitted.

    Raises:
        Exception: on an unsupported evidence file extension, a --reference
            value without a ':' separator, or a reference path that does not
            end in .gff3.
    """
    parser = argparse.ArgumentParser( description='Provides coverage information for features in a GFF3 file')

    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+', help='Path to one or more evidence files, separated by spaces' )
    parser.add_argument('-r', '--reference', type=str, required=True, help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE' )
    parser.add_argument('-f', '--fasta', type=str, required=True, help='Input path to the reference FASTA file.' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional path to an output file to be created, else prints on STDOUT' )
    args = parser.parse_args()

    ## parse the fasta
    fasta = utils.fasta_dict_from_file(args.fasta)

    ####################################################
    ## Sanity checks

    allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
    for ev_file in args.evidence_files:
        if not ev_file.endswith(tuple(allowed_extensions)):
            raise Exception("ERROR: Evidence file passed with unsupported file extension: {0}.  Supported extensions are {1}".format(ev_file, allowed_extensions))

    ## The reference should be defined as $path:$feattype
    if ':' not in args.reference:
        raise Exception("ERROR: input_file must be like /path/to/some.gff3:mRNA")

    ref_file_parts = args.reference.split(':')
    print("DEBUG: part count: {0}".format(len(ref_file_parts)))

    if ref_file_parts[0].endswith('.gff3'):
        # NOTE: the parsed reference features are not used further below.
        (ref_assemblies, ref_features) = gff.get_gff3_features(ref_file_parts[0])
    else:
        # The reference is passed with -r; there is no -i option.
        raise Exception("ERROR: Expected input file (-r) to have a gff3 extension, got {0}".format(ref_file_parts[0]))

    ####################################################
    ## Initialize the coverage arrays

    fasta_cov = dict()
    for seq_id in fasta:
        # create a list of 0s the length of the molecule
        fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

    ####################################################
    ## Now parse the evidence files

    for ev_file in args.evidence_files:
        if ev_file.endswith('pileup'):
            parse_pileup(fasta_cov, ev_file)
        elif ev_file.endswith('sam'):
            parse_sam(fasta_cov, ev_file)
        else:
            print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented".format(ev_file))

    ####################################################
    ## Write the report

    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        for seq_id in fasta_cov:
            # Iterate over the depth values themselves; they must not be
            # reused as list indexes.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        # Only close handles we opened ourselves, never STDOUT.
        if args.output_file is not None:
            fout.close()
def main():
    """Write the IDs of organism1 mRNAs supported by both AAT and BLAST evidence.

    Steps:
      1. Parse the organism1 GFF3 annotation and the FASTA database used by AAT.
      2. Parse the AAT GFF3 alignments into match chains (one
         nucleotide_to_protein_match row followed by its match_part rows).
      3. Keep each mRNA which overlaps at least one AAT match where both the
         mRNA coverage and the target coverage reach the coverage cutoff.
      4. Parse the BLAST btab file, keeping rows passing the e-value and
         percent identity cutoffs.
      5. Write the mRNA IDs present in both sets, one per line, to the output
         ID list (a name is generated from the cutoffs if -o is not passed).

    Raises:
        Exception: if a match_part row appears before any match row, if a
            reported overlap is larger than the mRNA, or if a match target
            is missing from the AAT FASTA database.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-a',
                        '--organism1_annotation',
                        type=str,
                        required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p',
                        '--organism1_aat_alignments',
                        type=str,
                        required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb',
                        '--aat_fasta_db',
                        type=str,
                        required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b',
                        '--organism1_blast_alignments',
                        type=str,
                        required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be',
                        '--blast_eval_cutoff',
                        type=float,
                        required=False,
                        default=1e-5,
                        help='BLAST e-value cutoff')
    # argparse %-formats help strings, so a literal percent sign has to be
    # doubled or printing the help (-h) fails
    parser.add_argument('-bpi',
                        '--blast_percent_identity_cutoff',
                        type=float,
                        required=False,
                        default=0,
                        help='BLAST %%identity cutoff')
    parser.add_argument(
        '-ppc',
        '--aat_percent_coverage_cutoff',
        type=float,
        required=False,
        default=0,
        help='%% coverage of the query protein by the AAT match')
    parser.add_argument('-o',
                        '--output_id_list',
                        type=str,
                        required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    # set this to an mRNA ID to restrict the comparison to that transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0

    # the match chain currently being built, and the assembly it belongs to.
    # The assembly is remembered separately because the chain is only stored
    # once the NEXT match row is read, which may be on a different assembly.
    current_match = None
    current_match_assembly_id = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()
    o1_with_o2 = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            # GFF3 coordinates are converted to 0-based, half-open here
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], 'ID').replace('"', '')
            target = gff.column_9_value(cols[8], 'Target')

            # only the first whitespace-delimited token of Target is the ID
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                # a new chain starts, so the previous one is complete
                if current_match is not None:
                    aat_matches[current_match_assembly_id].append(current_match)
                    aat_match_count += 1

                current_match = things.Match(
                    id=feature_id,
                    target_id=target,
                    subclass='nucleotide_to_protein_match',
                    length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id],
                                        fmin=fmin,
                                        fmax=fmax,
                                        strand=strand)
                current_match_assembly_id = assembly_id

            elif cols[2] == 'match_part':
                if current_match is None:
                    raise Exception(
                        "ERROR: Found match_part ({0}) before any nucleotide_to_protein_match row"
                        .format(feature_id))

                parent_id = gff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = things.MatchPart(id=feature_id,
                                              parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id],
                                     fmin=fmin,
                                     fmax=fmax,
                                     strand=strand)
                current_match.add_part(match_part)

    # the last chain in the file has no following match row to trigger its
    # storage, so store it here
    if current_match is not None:
        aat_matches[current_match_assembly_id].append(current_match)
        aat_match_count += 1

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".
                              format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    # None is treated as "no overlap"
                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})"
                            .format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb"
                            .format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(
                        aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage
                     ) = calculate_fragmented_coverage(
                         mRNA, aat_match, match_target_length)

                    if mRNA_percent_coverage >= args.aat_percent_coverage_cutoff and target_percent_coverage >= args.aat_percent_coverage_cutoff:
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".
        format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    # (if several rows pass for the same query, the last one read is kept)
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column 20 is compared against the e-value cutoff
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            # column 11 is compared against the percent identity cutoff
            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print(
        "INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    for o1_mRNA_id in o1_with_aat:
        if o1_mRNA_id in top_blast_hits:
            o1_with_o2.append(o1_mRNA_id)

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2"
        .format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
예제 #37
0
def main():
    """Plot the distribution of correct read-orientation ratios per transcript.

    For every line of the input file matching the expected pattern, the
    sequence is looked up in the FASTA file and skipped if shorter than
    LENGTH_CUTOFF.  The ratio 1-T / (1-T + 1-F) is computed for sequences
    with at least one such read, and a log-scaled histogram of all ratios
    is saved to the output file.  IDs whose ratio falls within
    [RATIO_MIN, RATIO_MAX] are printed to STDOUT and summary counts are
    written to STDERR.

    Raises:
        Exception: if a sequence ID from the input file is not present in
            the FASTA file.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here' )

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    args = parser.parse_args()

    ratios = list()

    # if set to true, the IDs in the mid-range will be printed to STDOUT
    print_ids = True

    # bounds of the "mid-range"; other pairs previously tried here were
    # 0.125/0.875, 0.25/0.75 and 0.475/0.525
    RATIO_MIN = 0.05
    RATIO_MAX = 0.95

    LENGTH_CUTOFF = 350
    ratio_min_count = 0
    ratio_bet_count = 0
    ratio_max_count = 0

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # lines are like: comp0_c0_seq1   1-T:6   1-F:0   2-T:0   2-F:5
    # (the 2-T / 2-F counts have to be present for a line to match, but
    #  only the 1-T / 1-F counts are used below)
    line_pattern = re.compile(r'(.+)\t1-T:(\d+)\t1-F:(\d+)\t2-T:(\d+)\t2-F:(\d+)')

    with open(args.input_file) as input_fh:
        for line in input_fh:
            m = line_pattern.search(line)
            if not m:
                continue

            seq_id = m.group(1)

            if seq_id not in fasta:
                raise Exception("Expected but filed to find seq ID {0} in FASTA file".format(seq_id))

            if len(fasta[seq_id]['s']) < LENGTH_CUTOFF:
                continue

            f_reads_correctly_mapped = int(m.group(2))
            f_reads_incorrectly_mapped = int(m.group(3))
            f_read_count = f_reads_correctly_mapped + f_reads_incorrectly_mapped

            # a ratio is undefined without any reads
            if f_read_count == 0:
                continue

            correct_ratio = f_reads_correctly_mapped / f_read_count
            ratios.append(correct_ratio)

            if correct_ratio < RATIO_MIN:
                ratio_min_count += 1
            elif correct_ratio > RATIO_MAX:
                ratio_max_count += 1
            else:
                ratio_bet_count += 1
                if print_ids:
                    print(seq_id)

    plt.hist(ratios, bins=100)
    plt.xlabel("Correct read orientation alignment ratio")
    plt.ylabel("Log of transcript count")
    plt.grid(True)
    plt.gca().set_yscale("log")
    plt.savefig(args.output_file)

    sys.stderr.write("Count of ratios < {0}: {1}\n".format(RATIO_MIN, ratio_min_count))
    # the middle bucket holds everything not counted by the other two, so
    # both bounds are inclusive
    sys.stderr.write("Count where {0} <= ratio <= {1}: {2}\n".format(RATIO_MIN, RATIO_MAX, ratio_bet_count))
    sys.stderr.write("Count of ratios > {0}: {1}\n".format(RATIO_MAX, ratio_max_count))
예제 #38
0
 def load_from_file(self, file):
     """Create an mRNA for each FASTA entry and pass it to self.add().

     file is handed to utils.fasta_dict_from_file(); each returned key is
     used as the mRNA id and the entry's 's' value as its residues.
     """
     seqs = utils.fasta_dict_from_file(file)

     for seq_id in seqs:
         # The local name must differ from the mRNA class: assigning to a
         # local called 'mRNA' makes that name local to the whole function,
         # so calling mRNA(...) would raise UnboundLocalError.
         transcript = mRNA(id=seq_id, residues=seqs[seq_id]['s'])
         self.add(transcript)
def main():
    """Report inverted and direct repeats found within each input sequence.

    Every sequence of the input FASTA is written to a scratch FASTA file and
    aligned against itself with blastn (tabular output, -outfmt 6).  Each hit
    at least --min_repeat_size long is then classified:

      * query and subject in opposite orientation -> INVERSION
      * same orientation and neither query end lies within the subject
        range -> DIRECT REPEAT

    Results are written to the output file and progress is printed every
    100 sequences.  The scratch files are named after the process ID and are
    removed when the run ends.
    """
    parser = argparse.ArgumentParser( description='Use BLAST to identify internal inverted repeats')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-n', '--min_repeat_size', type=int, required=True, help='Minimum size of a repeat to consider' )
    parser.add_argument('-pid', '--percent_identity', type=float, required=False, default=98.0, help='Percent identity cutoff' )
    args = parser.parse_args()

    # parse FASTA input, storing into a dict keyed by ID
    seqs = utils.fasta_dict_from_file(args.input_file)

    # The scratch file names (and therefore the command) only depend on the
    # PID and the arguments, so they are built once and reused per sequence.
    fasta_name = "{0}.temp.input.fasta".format(os.getpid())
    blast_name = "{0}.temp.blast.out".format(os.getpid())
    cmd = "blastn -query {0} -subject {0} -outfmt 6 -out {1} -word_size {2} -perc_identity {3}".format(fasta_name, blast_name, args.min_repeat_size, args.percent_identity)

    seqs_processed = 0
    print_interval = 100

    try:
        with open(args.output_file, 'wt') as ofh:
            for seq_id in seqs:
                # Write a FASTA of just this sequence
                with open(fasta_name, 'wt') as fasta_fh:
                    fasta_fh.write(">{0}\n{1}".format(seq_id, seqs[seq_id]['s']))

                # Perform the self-vs-self blast
                run_command(cmd)

                # Parse the result file to look for inverted repeats
                with open(blast_name) as blast_fh:
                    for line in blast_fh:
                        if line.startswith('#'):
                            continue

                        cols = line.split()
                        if not cols:
                            # blank line, nothing to parse
                            continue

                        # columns 7-10: query start/end, subject start/end
                        qstart, qend, sstart, send = int(cols[6]), int(cols[7]), int(cols[8]), int(cols[9])

                        if qstart < qend:
                            q_orientation = 'F'
                            match_len = qend - qstart + 1
                        else:
                            q_orientation = 'R'
                            match_len = qstart - qend + 1

                        s_orientation = 'F' if sstart < send else 'R'

                        # both report types require the minimum length
                        if match_len < args.min_repeat_size:
                            continue

                        if s_orientation != q_orientation:
                            ofh.write("INVERSION of {5} bp in {4}: {0}\t{1}\t{2}\t{3}\n".format(qstart, qend, sstart, send, cols[0], match_len))
                            continue

                        # same orientation: skip the hit if either query end
                        # falls inside the subject range
                        qstart_in_subject = sstart <= qstart <= send
                        qend_in_subject = sstart <= qend <= send

                        if not (qstart_in_subject or qend_in_subject):
                            ofh.write("DIRECT REPEAT of {5} bp in {4}: {0}\t{1}\t{2}\t{3}\n".format(qstart, qend, sstart, send, cols[0], match_len))

                seqs_processed += 1

                if seqs_processed % print_interval == 0:
                    print("INFO: processed {0} input sequences".format(seqs_processed))
    finally:
        # don't leave the per-process scratch files behind
        for temp_path in (fasta_name, blast_name):
            if os.path.exists(temp_path):
                os.remove(temp_path)
예제 #40
0
def main():
    """Split the sequences of a FASTA file around reported coverage gaps.

    The gaps file is tab-delimited with molecule ID, gap start and gap stop
    in the first three columns.  The code assumes rows of one molecule are
    adjacent and ordered by position (it only compares each row with the
    previous one).  For every molecule with gaps, the sequence before the
    first gap, between consecutive gaps and after the last gap is passed to
    export_fragment().  Molecules not recorded in molecules_split are then
    written in full.

    Gaps shorter than --min_gap_length are ignored; --min_fragment_length is
    passed through to export_fragment().
    """
    parser = argparse.ArgumentParser(
        description='Splits FASTA file based on reported coverage gaps')

    ## output file to be written
    parser.add_argument('-g',
                        '--gaps_file',
                        type=str,
                        required=True,
                        help='Path to an input gaps file to be read')
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-mfl',
        '--min_fragment_length',
        type=int,
        required=False,
        help='Min length required for a fragment to be exported')
    parser.add_argument('-mgl',
                        '--min_gap_length',
                        type=int,
                        required=False,
                        help='Ignore gaps reported under this min length')
    args = parser.parse_args()

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # this is just to keep track of which we've exported
    # NOTE(review): presumably filled in by export_fragment() - confirm
    molecules_split = dict()

    last_molecule_id = None
    last_end_coordinate = None

    with open(args.output_file, 'wt') as ofh:
        with open(args.gaps_file) as gaps_fh:
            for line in gaps_fh:
                # tolerate blank lines (e.g. at the end of the file)
                if not line.strip():
                    continue

                cols = line.split('\t')
                mol_id, start, stop = cols[0], int(cols[1]), int(cols[2])

                # skip if this is too short
                if args.min_gap_length is not None and (stop - start +
                                                        1) < args.min_gap_length:
                    continue

                if last_molecule_id is None:
                    # first entry, export only beginning of molecule to gap start
                    export_fragment(ofh, fasta, mol_id, 1, start - 1,
                                    args.min_fragment_length, molecules_split)

                elif mol_id != last_molecule_id:
                    # new molecule, export end of last molecule
                    last_molecule_length = len(fasta[last_molecule_id]['s'])
                    export_fragment(ofh, fasta, last_molecule_id,
                                    last_end_coordinate + 1,
                                    last_molecule_length,
                                    args.min_fragment_length, molecules_split)
                    # now export the beginning of this one unless the start is 1
                    if start != 1:
                        export_fragment(ofh, fasta, mol_id, 1, start - 1,
                                        args.min_fragment_length,
                                        molecules_split)
                else:
                    # same molecule as we just saw, but new entry for it
                    # export end of last gap until beginning of this one
                    export_fragment(ofh, fasta, mol_id,
                                    last_end_coordinate + 1, start - 1,
                                    args.min_fragment_length, molecules_split)

                last_molecule_id = mol_id
                last_end_coordinate = stop

        # Inside the loop a molecule's tail is only exported once the next
        # molecule is seen, so the final molecule's tail is handled here.
        if last_molecule_id is not None:
            last_molecule_length = len(fasta[last_molecule_id]['s'])
            export_fragment(ofh, fasta, last_molecule_id,
                            last_end_coordinate + 1, last_molecule_length,
                            args.min_fragment_length, molecules_split)

        # Now export the full sequences of any which weren't split
        for seq_id in fasta:
            if seq_id not in molecules_split:
                ofh.write(">{0}\n{1}\n".format(
                    seq_id, utils.wrapped_fasta(fasta[seq_id]['s'])))
def main():
    """Report non-standard residues in a multi-FASTA file, optionally replacing one.

    A residue is non-standard when its upper-cased form is neither in the
    standard alphabet for the chosen --type (ATGCU for 'n', the 20 amino
    acid letters for 'p') nor in the --ignore set.

    Outputs:
      * --print_locations: one STDERR line per non-standard residue with its
        1-based position.
      * --list: one line per affected sequence with per-residue counts.
      * --replace/--with_: every sequence is written (60 residues per line)
        to --output, or STDOUT if that is not passed, after the replacement.
        Nothing is written to the output when --replace is not passed.

    Raises:
        Exception: if only one of --replace / --with_ is passed.
    """
    parser = argparse.ArgumentParser( description='Reports on non-standard characters in multifasta files and can optionally replace residues')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-t', '--type', type=str, required=True, choices=('n', 'p'), help='Either n for nucleotide or p for protein')
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created if doing replacement' )
    parser.add_argument('-pl', '--print_locations', dest='print_locations', action='store_true', help='If passed, will report coordinate of each non-standard residue on STDERR' )
    parser.add_argument('-r', '--replace', type=str, required=False, help='Replace this character with the one defined by --with_' )
    parser.add_argument('-w', '--with_', type=str, required=False, help='This character or set replaces all instances of the one found in --replace' )
    parser.add_argument('-l', '--list', type=str, required=False, help='Optional file of IDs where non-standard residues were detected or replaced' )
    parser.add_argument('-g', '--ignore', type=str, required=False, default='N*X', help='List of characters to not report as non-standard.  Default = the universal ambiguity bases (N, X) or the end-of-translation stop for proteins (*)' )
    parser.set_defaults(print_locations=False)
    args = parser.parse_args()

    ## if you define --replace, you must also define --with_, and vice versa
    ## (checked before any output file is opened so a bad invocation does not
    ##  leave an empty file behind)
    if args.replace is not None and args.with_ is None:
        raise Exception("ERROR: You must pass --with_ when passing --replace")
    if args.with_ is not None and args.replace is None:
        raise Exception("ERROR: You must pass --replace when passing --with_")

    seqs = utils.fasta_dict_from_file(args.input)

    ## standard characters (depends on the type of sequence)
    if args.type == 'n':
        standard_residues = set("ATGCU")
    else:
        standard_residues = set("ACDEFGHIKLMNPQRSTVWY")

    ## build the lookup of characters to ignore
    ignore_residues = {residue.upper() for residue in args.ignore}

    out_fh = sys.stdout
    list_fh = None

    try:
        if args.output is not None:
            out_fh = open(args.output, 'wt')

        if args.list is not None:
            list_fh = open(args.list, 'wt')

        ## process the sequences
        for seq_id in seqs:
            seq = seqs[seq_id]

            # residue -> number of occurrences within this sequence
            bad_chars = dict()

            # positions are reported 1-based
            for position, base in enumerate(seq['s'], 1):
                ubase = base.upper()
                if ubase in standard_residues or ubase in ignore_residues:
                    continue

                bad_chars[ubase] = bad_chars.get(ubase, 0) + 1

                if args.print_locations:
                    print("Molecule {0} contains residue {1} at position {2}".format(seq_id, ubase, position), file=sys.stderr)

            if list_fh is not None and bad_chars:
                list_fh.write("{0}".format(seq_id))
                for base in bad_chars:
                    list_fh.write( "\t{0}:{1}".format(base, bad_chars[base]) )

                list_fh.write("\n")

            if args.replace is not None:
                # the replacement is case-sensitive (plain str.replace)
                seq['s'] = seq['s'].replace(args.replace, args.with_)
                out_fh.write( ">{0} {1}\n".format(seq_id, seq['h']) )

                for offset in range(0, len(seq['s']), 60):
                    out_fh.write(seq['s'][offset : offset + 60] + "\n")
    finally:
        if list_fh is not None:
            list_fh.close()

        # never close STDOUT, only a file this function opened
        if out_fh is not sys.stdout:
            out_fh.close()
예제 #42
0
def main():
    """Extract regions of a multi-FASTA file listed in a tab-delimited file.

    Each row of the coordinates file names a molecule and a start/stop pair;
    which (1-based) columns hold them is given by -m, -x and -y.  The
    coordinates are converted with utils.humancoords_to_0interbase(), the
    corresponding slice of the molecule is taken and, when the returned
    strand is -1, reverse-complemented.  Fragments are written as FASTA, 60
    residues per line, to --output_file or STDOUT.

    The fragment ID is the value of the --name_col column when passed,
    otherwise it is built from the molecule ID, converted coordinates and
    strand.  Rows with fewer than three columns are skipped.

    Raises:
        Exception: if a molecule ID is not present in the FASTA file.
    """
    parser = argparse.ArgumentParser(
        description='Extract regions from a multi-FASTA file')

    ## output file to be written
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c',
                        '--coords_file',
                        type=str,
                        required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m',
                        '--mol_col',
                        type=int,
                        required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument(
        '-x',
        '--start_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate start positions')
    parser.add_argument(
        '-y',
        '--stop_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate stop positions')
    parser.add_argument(
        '-n',
        '--name_col',
        type=int,
        required=False,
        default=None,
        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # column options are 1-based on the command line, list indexes
        # are 0-based
        start_col = args.start_coord_col - 1
        stop_col = args.stop_coord_col - 1
        mol_col = args.mol_col - 1
        name_col = None if args.name_col is None else args.name_col - 1

        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax,
                 strand) = utils.humancoords_to_0interbase(int(cols[start_col]),
                                                           int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception(
                        "ERROR: molecule ID ({0}) not found in FASTA file".format(
                            mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(mol_id, fmin, fmax,
                                                        strand)
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        # never close STDOUT, only a file this function opened
        if fout is not sys.stdout:
            fout.close()
예제 #43
0
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll need
    to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count the space from
    the beginning of the contig until the first gene, or only between them?  What about short contigs which
    have no annotated genes at all?  From the Sequence Ontology:

    SO:0000605: A region containing or overlapping no genes that is bounded on either side by a gene, or
    bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include the
    former in intergenic space reporting but include the latter as a separate statistic.

    Implementation notes:
    A molecule counts as "empty" when it has no genes of its own.  Genes overlapping the region
    already covered by earlier genes add no intergenic space, and the next distance is measured
    from the furthest gene end seen so far.  The intron statistics are only reported when at
    least one intron was found.

    Raises an Exception if any molecule has an undefined or 0 length.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser( description='Reports statistics of reference gene coverage and extension by aligned RNA-seq transcript data.')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)' )
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)' )
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)

    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # NOTE(review): the distance logic below assumes this sort orders
        # genes by position along the molecule - confirm against the gene
        # type's comparison methods
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # test this assembly's own genes, not the running total, so that
        # gene-less molecules are recognized wherever they appear
        if not genes:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # location of the gene with the furthest end (fmax) seen so far
        previous_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if previous_gene_loc is None:
                # first gene: the space runs from the molecule start to it
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            elif gene_loc.fmin < previous_gene_loc.fmax:
                # overlaps the covered region, so there is no space to
                # record; only move the boundary if this gene extends it
                if gene_loc.fmax > previous_gene_loc.fmax:
                    previous_gene_loc = gene_loc

            else:
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin - previous_gene_loc.fmax
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

        # space between the furthest gene end and the end of the molecule
        # (previous_gene_loc is always set here: this molecule has genes)
        total_intergenic_space_count += 1
        intergenic_distance = assembly.length - previous_gene_loc.fmax
        total_intergenic_space_residues += intergenic_distance
        intergenic_distances.append(intergenic_distance)

    # sorted copies are used for the median/min/max below; the lists stay
    # lists (possibly empty) so the plotting code can test them directly
    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    if total_intergenic_space_count == 0:
        avg_intergenic_space_dist = None
        median_int_space_dist = None
    else:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[len(intergenic_distances) // 2]

    # annotations without introns would otherwise divide by zero here
    if total_intron_count == 0:
        avg_intron_size = None
        median_intron_size = None
    else:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[len(intron_sizes) // 2]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count) )

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues) )
    print("Empty molecule bases: {0} bp".format(empty_contig_residues) )

    if total_intergenic_space_count > 0:
        print("Intergenic space count: {0}".format(total_intergenic_space_count) )
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist) )
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist) )
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]) )
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]) )
    else:
        print("There were no intergenic spaces found.  This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count) )
    print("Intron space count: {0}".format(total_intron_residues) )

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size) )
        print("Median intron size: {0} bp".format(median_intron_size) )
        print("Minimum intron size: {0} bp".format(intron_sizes[0]) )
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]) )
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        # imported here so matplotlib is only needed when a plot is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot the data sets which actually have values
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances' )

        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes' )

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
예제 #44
0
def main():
    """Split a multi-FASTA file into protein and/or nucleotide FASTA files.

    Each record of the input file is scored with nucleotide_composition().
    A record scoring at or above --cutoff is written to the --nucleotide
    file; any other record is written to the --protein file.  Records whose
    destination file was not requested are silently dropped.

    Raises:
        Exception: if neither -p nor -n was specified.
    """
    parser = argparse.ArgumentParser(
        description=
        'Split multi-FASTA file into separate protein and nucleotide files')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    ## output files to be written
    parser.add_argument('-p',
                        '--protein',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file for the protein sequences')
    parser.add_argument('-n',
                        '--nucleotide',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file for the nucleotide sequences')
    ## The cutoff has to be parsed as a number: it is compared directly with
    ## the value returned by nucleotide_composition() (presumably a numeric
    ## percentage), and a str operand would make that comparison fail.
    parser.add_argument(
        '-c',
        '--cutoff',
        type=float,
        required=False,
        default=80,
        help=
        'Min percent (1-100) of ATGCNX content to be considered a nucleotide sequence'
    )
    args = parser.parse_args()

    ## the user should have specified at least one
    if args.protein is None and args.nucleotide is None:
        raise Exception(
            "ERROR: you must specify either -p or -n options (else why are you running this script?"
        )

    pout = nout = None

    try:
        if args.protein is not None:
            pout = open(args.protein, 'wt')

        if args.nucleotide is not None:
            nout = open(args.nucleotide, 'wt')

        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            seq = seqs[seq_id]
            seqcomp = nucleotide_composition(seq['s'])
            seq_wrapped = wrapped(seq['s'], every=60)

            ## at/above the cutoff it's a nucleotide, otherwise a protein
            target = nout if seqcomp >= args.cutoff else pout

            if target is not None:
                target.write(">{0} {1}\n{2}\n".format(seq_id, seq['h'],
                                                      seq_wrapped))
    finally:
        ## make sure buffered output reaches disk even if a record fails
        for fh in (pout, nout):
            if fh is not None:
                fh.close()
예제 #45
0
 def load_from_file(self, file):
     """Populate this collection from a FASTA file.

     The file is read with utils.fasta_dict_from_file(); for every entry
     an Assembly is built from the entry ID and its 's' value (used as the
     residues) and handed to self.add().
     """
     records = utils.fasta_dict_from_file(file)

     for identifier in records:
         self.add(Assembly(id=identifier, residues=records[identifier]['s']))
예제 #46
0
def main():
    """Merge the N-masking of several FASTA files holding the same sequences.

    The last FASTA file on the command line is used as the seed.  Every
    other file is compared to it base by base: wherever another file has an
    'N' the seed base is replaced by 'N'; any other disagreement (where the
    seed is not already 'N') is reported as a warning and the seed base is
    kept.  The merged sequences are written to --output_file, or to STDOUT
    when no output file is given.

    Raises:
        Exception: if a file contains a sequence ID missing from the seed
            file, or a sequence whose length differs from the seed's.
    """
    parser = argparse.ArgumentParser(description='Merge masked FASTA files')

    parser.add_argument('fasta_files',
                        metavar='N',
                        type=str,
                        nargs='+',
                        help='Pass one or more FASTA files')
    ## output file to be written
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # work on a copy so pulling off the seed file doesn't alter the
        # parsed arguments
        files = list(args.fasta_files)

        # pull off a file and index it
        seqs = utils.fasta_dict_from_file(files.pop())

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in files:
            new_seqs = utils.fasta_dict_from_file(fasta_file)

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception(
                        "ERROR: Seq ID {0} was found in file {1} but not in the seed file"
                        .format(seq_id, fasta_file))

                merged = seqs[seq_id]['s']
                incoming = new_seqs[seq_id]['s']

                # they should also be the same length; compare the residues
                # themselves, not the record dicts that hold them
                if len(merged) != len(incoming):
                    raise Exception(
                        "ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths"
                        .format(seq_id, fasta_file))

                for i, base in enumerate(incoming):
                    if base != merged[i]:
                        if base == 'N':
                            merged[i] = 'N'
                        elif merged[i] != 'N':
                            # NOTE(review): warnings go to STDOUT and so mix
                            # with the FASTA output when -o is not given
                            print("WARNING: Disagreement {0}-{1} at position {2}".
                                  format(base, merged[i], i))

        # now done, print out the results
        for seq_id in seqs:
            ofh.write(">{0} {1}\n{2}\n".format(
                seq_id, seqs[seq_id]['h'],
                utils.wrapped_fasta(''.join(seqs[seq_id]['s']))))
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Report reference and gene coverage from a nucmer coords file.

    Reads the 11-column alignment lines of a show-coords file, groups them
    by the reference ID in column 10 (consecutive lines with the same ID
    form one group) and passes each group to
    calculate_gene_coverage_fragments() (when the reference is annotated)
    and calculate_fragment_coverage().  Results are written to six files
    named <output_prefix> plus one of:

        .stats.gene_coverage, .list.genes_missing, .stats.refmol_coverage,
        .tab.refmol_coverage, .tab.extensions, .tab.gene_coverage

    Raises:
        Exception: if the coords file contains no 11-column alignment line.
    """
    # local import: ExitStack guarantees every handle opened below is closed
    import contextlib

    parser = argparse.ArgumentParser( description='Parses nucmer coords output to provide an overall coverage report')

    ## coords file generated with: show-coords -l -r -T out.delta
    parser.add_argument('-c', '--coords_file', type=str, required=True,
                        help='Path to a nucmer coords file with non-overlapping results (requires -l -r -T options of show-coords)' )
    parser.add_argument('-o', '--output_prefix', type=str, required=True, help='Several output files will be created with this prefix.' )
    parser.add_argument('-a', '--annotation_file', type=str, required=True, help='Path to a sorted GFF3 annotation file' )
    parser.add_argument('-r', '--reference_fasta', type=str, required=True, help='Path to the reference file used with nucmer' )
    parser.add_argument('-k', '--annotation_key', type=str, required=False, help='Optional.  Key string to look for in the 9th column of the GFF3 file for an annotation string.' )
    args = parser.parse_args()

    ## like: h[$assem] = [ {id=?,fmin=?,fmax=?}, ...  ]
    annot = parse_annotation( args.annotation_file, args.annotation_key )

    ## like: [ {id=?,qfmin=?,qfmax=?,rfmin=?,rfmax=?} ]
    query_fragments = []

    ref_molecules = utils.fasta_dict_from_file(args.reference_fasta)
    ref_n_total = sum( len(ref_molecules[ref_id]['s']) for ref_id in ref_molecules )

    ref_cov_stats = { 'n_cov': 0, 'n_uncov': 0, 'n_identical': 0 }

    alignment_lines_found = 0
    current_ref_id = None
    current_ref_length = None

    with contextlib.ExitStack() as stack:
        def open_output(suffix):
            # open <output_prefix><suffix> for writing; closed when the stack exits
            return stack.enter_context( open(args.output_prefix + suffix, "wt") )

        ## open the output files
        genecov_stats_ofh     = open_output(".stats.gene_coverage")
        genesmissing_list_ofh = open_output(".list.genes_missing")
        refmol_stats_ofh      = open_output(".stats.refmol_coverage")
        refcov_stats_ofh      = open_output(".tab.refmol_coverage")
        refext_list_ofh       = open_output(".tab.extensions")
        genecov_tab_ofh       = open_output(".tab.gene_coverage")
        refext_list_ofh.write("# {0}\n".format(args.output_prefix) )
        refext_list_ofh.write("# reference_id\tref_fmin\tref_fmax\tref_strand\tqry_id\tqry_fmin\tqry_fmax\tqry_strand\tqry_length\n")

        def finish_reference(ref_id, ref_length, fragments):
            # process all fragments collected for one reference molecule
            if ref_id in annot:
                calculate_gene_coverage_fragments( annot[ref_id], fragments )

            calculate_fragment_coverage( ref_id, fragments, ref_length, ref_cov_stats, refcov_stats_ofh, refext_list_ofh )

        coords_fh = stack.enter_context( open(args.coords_file, 'r') )

        for line in coords_fh:
            cols = line.split()

            ## only 11-column lines are alignment rows; anything else is skipped
            if len(cols) != 11:
                continue

            alignment_lines_found += 1

            ## columns 1-4 are the reference start/end and query start/end
            ref_a, ref_b, qry_a, qry_b = (int(c) for c in cols[0:4])

            if cols[9] != current_ref_id:
                if current_ref_id is not None:
                    finish_reference( current_ref_id, current_ref_length, query_fragments )

                ## reset
                current_ref_id = cols[9]
                current_ref_length = int(cols[7])
                query_fragments = []

                ## quick sanity check
                if current_ref_id not in annot:
                    print("WARNING: found a nucleotide accession for which we have no annotation: {0}".format(current_ref_id))

            ## a query start greater than its end is recorded as strand -1;
            ## coordinates are stored as min - 1 .. max
            query_fragments.append({
                'id':      cols[10],
                'qfmin':   min(qry_a, qry_b) - 1,
                'qfmax':   max(qry_a, qry_b),
                'qlen':    int(cols[8]),
                'qstrand': -1 if qry_a > qry_b else 1,
                'rfmin':   min(ref_a, ref_b) - 1,
                'rfmax':   max(ref_a, ref_b),
                'rlen':    int(cols[7]),
                'pctid':   float(cols[6]),
            })

        ## don't forget the last one
        if current_ref_id is not None:
            finish_reference( current_ref_id, current_ref_length, query_fragments )

        if alignment_lines_found == 0:
            raise Exception("ERROR: failed to find any 11-column alignment lines")
        else:
            print("INFO: {0} alignment lines found".format(alignment_lines_found) )

        report_gene_coverage_results( annot, genecov_stats_ofh, genesmissing_list_ofh, genecov_tab_ofh )

        cov_perc = (ref_cov_stats['n_cov'] / ref_n_total) * 100
        cov_perc_id = (ref_cov_stats['n_identical'] / ref_n_total) * 100
        refmol_stats_ofh.write("Total bases in reference molecules\t{0}\n".format(ref_n_total) )
        refmol_stats_ofh.write("Ref bases covered by query fragments\t{0}\n".format(ref_cov_stats['n_cov']) )
        refmol_stats_ofh.write("Ref % covered by query fragments\t{0:.2f}\n".format(cov_perc))
        refmol_stats_ofh.write("Ref % identity by query fragments\t{0:.2f}\n".format(cov_perc_id))
예제 #48
0
def main():
    """Split the sequences of a FASTA file at reported coverage gaps.

    The gaps file is read as tab-delimited lines of molecule ID, gap start
    and gap stop; entries for the same molecule are expected to be adjacent
    and ordered by position.  The stretches before, between and after the
    gaps of a molecule are passed to export_fragment(); molecules that never
    appear in a retained gap are written out unchanged.

    Compared with earlier behaviour:
      * the stretch after the final gap of the last molecule in the gaps
        file is now exported (it used to be dropped),
      * a first gap that starts at position 1 no longer triggers the export
        of an empty leading fragment, matching how later molecules are
        handled,
      * blank lines in the gaps file are skipped.
    """
    parser = argparse.ArgumentParser(description="Splits FASTA file based on reported coverage gaps")

    parser.add_argument("-g", "--gaps_file", type=str, required=True, help="Path to an input gaps file to be read")
    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file to be read")
    ## output file to be written
    parser.add_argument("-o", "--output_file", type=str, required=True, help="Path to an output file to be created")
    parser.add_argument(
        "-mfl",
        "--min_fragment_length",
        type=int,
        required=False,
        help="Min length required for a fragment to be exported",
    )
    parser.add_argument(
        "-mgl", "--min_gap_length", type=int, required=False, help="Ignore gaps reported under this min length"
    )
    args = parser.parse_args()

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # this is just to keep track of which we've exported
    molecules_split = dict()

    last_molecule_id = None
    last_end_coordinate = None

    with open(args.output_file, "wt") as ofh, open(args.gaps_file) as gaps_fh:

        def export_tail(mol_id, gap_end):
            # export everything after the last retained gap of a molecule
            export_fragment(
                ofh,
                fasta,
                mol_id,
                gap_end + 1,
                len(fasta[mol_id]["s"]),
                args.min_fragment_length,
                molecules_split,
            )

        for line in gaps_fh:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            cols = line.split("\t")
            mol_id, start, stop = cols[0], int(cols[1]), int(cols[2])

            # skip if this is too short
            if args.min_gap_length is not None and (stop - start + 1) < args.min_gap_length:
                continue

            if mol_id != last_molecule_id:
                # new molecule: first finish the previous one, if there was one
                if last_molecule_id is not None:
                    export_tail(last_molecule_id, last_end_coordinate)

                # now export the beginning of this one unless the start is 1
                if start != 1:
                    export_fragment(ofh, fasta, mol_id, 1, start - 1, args.min_fragment_length, molecules_split)
            else:
                # same molecule as we just saw, but new entry for it
                # export end of last gap until beginning of this one
                export_fragment(
                    ofh, fasta, mol_id, last_end_coordinate + 1, start - 1, args.min_fragment_length, molecules_split
                )

            last_molecule_id = mol_id
            last_end_coordinate = stop

        # the loop only finishes a molecule when the next one shows up, so
        # the last molecule still needs its trailing stretch exported
        if last_molecule_id is not None:
            export_tail(last_molecule_id, last_end_coordinate)

        # Now export the full sequences of any which weren't split
        for seq_id in fasta:
            if seq_id not in molecules_split:
                ofh.write(">{0}\n{1}\n".format(seq_id, utils.wrapped_fasta(fasta[seq_id]["s"])))