Example #1
def main():
    parser = argparse.ArgumentParser(
        description='Plots genotype quality vs. divergence from reference for a list of VCF files')

    ## command-line options
    parser.add_argument('-v',
                        '--vcf_list',
                        type=str,
                        required=True,
                        help='Input list of VCF files')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Base output path to be created.')
    args = parser.parse_args()

    vcf_files = utils.read_list_file(args.vcf_list)

    for vcf_file in vcf_files:
        plot_vcf_file(vcf_file)

    plt.legend(loc='best')
    plt.xlabel("Genotype quality")
    plt.ylabel("Divergence from reference (SNPs per kb)")
    plt.savefig("{0}.png".format(args.output))
Example #2
def main():
    parser = argparse.ArgumentParser( description='Merge multiple BAM files, retaining headers')

    ## command-line options
    parser.add_argument('-i', '--input_list', type=str, required=True, help='Path to an input list file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name/path of output file to be created' )
    args = parser.parse_args()

    bam_files = utils.read_list_file(args.input_list)

    ## take the header from the first file, change the sorting definition
    first_file = bam_files[0]
    run_command("samtools view -H {0} | head -n 1 > {0}.firstline".format(first_file))
    
    ofh = open("{0}.sam".format(args.output_base), 'wt')
    for line in open("{0}.firstline".format(first_file)):
        ## we expect: @HD     VN:1.0  SO:coordinate
        m = re.match("^@HD\s+VN\:(\S+)\s+SO:", line)
        if m:
            ofh.write("@HD\tVN:{0}\tSO:unknown\n".format(m.group(1)))
    ofh.close()

    ## merge headers
    for bam_file in bam_files:
        run_command("samtools view -H {0} | grep -v '@HD' >> {1}.sam".format(bam_file, args.output_base))

    ## merge sequence files
    for bam_file in bam_files:
        run_command("samtools view {0} >> {1}.sam".format(bam_file, args.output_base))

    ## convert SAM to BAM
    run_command("samtools view -bS {0}.sam > {0}.bam".format(args.output_base))

    ## delete the SAM
    run_command("rm {0}.sam".format(args.output_base))
def main():
    parser = argparse.ArgumentParser( description='Looks for FASTA entries which seem to have been embedded within another')
    parser.add_argument('-l', '--input_list', type=str, required=False, help='A list file with the paths of all files' )
    parser.add_argument('-f', '--fasta_file', type=str, required=False, help='The single FASTA file to be processed' )
    args = parser.parse_args()

    if args.input_list is None and args.fasta_file is None:
        raise Exception("ERROR: You must specify either --input_list or --fasta_file to define input");

    files_to_process = list()

    if args.input_list is not None:
        for file in read_list_file( args.input_list ):
            files_to_process.append(file)

    if args.fasta_file is not None:
        files_to_process.append(args.fasta_file)

    for file in files_to_process:
        fail_lines = process_fasta_file( file )

        if ( len(fail_lines) > 0 ):
            print( "{0}\t{1}\n".format(file, len(fail_lines)) )

            for line in fail_lines:
                print("\t{0}\n".format(line))
Example #4
def main():
    parser = argparse.ArgumentParser(
        description=
        'Parses BLAST results to include or exclude entries from a FASTA file')
    parser.add_argument(
        '-l',
        '--blast_tab_list',
        type=str,
        required=True,
        help=
        'A list file with the paths of all files in -m 8 output format from NCBI blast'
    )
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='The FASTA file to be filtered')
    parser.add_argument(
        '-m',
        '--mode',
        type=str,
        required=True,
        choices=['include', 'exclude'],
        help=
        'Enter either "include" or "exclude" based on the mode of filtering you want based on the blast hits.'
    )
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-e',
                        '--eval_cutoff',
                        type=str,
                        required=True,
                        help='Filter entries with an E-value lower than this')
    args = parser.parse_args()

    ## 3 different options for each accession in the fasta_file
    #   - have no key here - wasn't found in the BLAST matches at all
    #   - key with value of 0 - found in BLAST, but with scores not meeting cutoffs
    #   - key with value of 1 - found in BLAST, with scores meeting cutoffs
    ids_matched = {}
    c_ids_matching_cutoffs = 0

    for file in read_list_file(args.blast_tab_list):
        c_ids_matching_cutoffs += process_blast_file(file, ids_matched, args)

    [c_total, c_kept] = export_file(args.fasta_file, args.output_file,
                                    args.mode, ids_matched)

    print("INFO: Exported {0} entries out of {1} total in the source file".
          format(c_kept, c_total))
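process_blast_file() and export_file() are also not shown. A hedged sketch of the former, following the no-key/0/1 convention described in the comment above and assuming the FASTA sequences were the BLAST queries in standard -m 8 columns (query ID in column 0, E-value in column 10); the real helper may apply additional cutoffs or interpret --eval_cutoff differently:

def process_blast_file(path, ids_matched, args):
    """Mark query IDs seen in an m8 file; return how many newly met the E-value cutoff."""
    newly_matched = 0
    cutoff = float(args.eval_cutoff)

    for line in open(path):
        cols = line.rstrip().split("\t")
        if len(cols) < 11:
            continue

        qry_id, e_value = cols[0], float(cols[10])

        if e_value <= cutoff:
            if ids_matched.get(qry_id) != 1:
                newly_matched += 1
            ids_matched[qry_id] = 1
        else:
            ids_matched.setdefault(qry_id, 0)

    return newly_matched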
def parse_hmm_evidence( log_fh, polypeptides, htab_list, cursor ):
    '''
    Reads a list file of HMM evidence and dict of polypeptides, populating each with
    Annotation evidence where appropriate.  Each file in the list can have results
    for multiple queries, but it's assumed that ALL candidate matches for any given
    query are grouped together.

    Currently only the top hit for any given query polypeptide is used.
    '''
    for file in utils.read_list_file(htab_list):
        last_qry_id = None
        
        for line in open(file):
            line = line.rstrip()
            cols = line.split("\t")
            
            ## only consider the row if the total score is above the total trusted cutoff
            if float(cols[12]) < float(cols[17]):
                continue

            this_qry_id = cols[5]
            accession = cols[0]
            version = None

            # if this is a PFAM accession, handle the version
            m = re.match("^(PF\d+)\.\d+", accession)
            if m:
                version = accession
                accession = m.group(1)

            ## the HMM hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                ## save it
                annot = polypeptides[this_qry_id].annotation
                annot.product_name = cols[15]
                log_fh.write("INFO: {0}: Updated product name to '{1}' based on HMM hit to accession '{2}'".format(this_qry_id, annot.product_name, accession))
                
                # does our hmm database provide GO terms for this accession?
                for go_annot in get_hmmdb_go_terms( accession, cursor ):
                    annot.add_go_annotation(go_annot)

                # do we have a gene symbol for this accession?
                annot.gene_symbol = get_hmmdb_gene_symbol( accession, cursor )

                # do we have an EC number?
                for ec_annot in get_hmmdb_ec_nums( accession, cursor ):
                    annot.add_ec_number(ec_annot)

                ## remember the ID we just saw
                last_qry_id = this_qry_id
def parse_trembl_blast_evidence(polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against TrEMBL and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' in the product name.
    '''
    for file in utils.read_list_file(blast_list):
        last_qry_id = None
        
        for line in open(file):
            line = line.rstrip()
            cols = line.split("\t")

            # We're going to ignore any lines which have 'uncharacterized' in the name
            if 'ncharacterized' in cols[15]:
                continue
            
            this_qry_id = cols[0]

            # skip this line if it doesn't meet the cutoff
            if float(cols[19]) > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                #  then process for known accession types
                accession = cols[5]

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    # current hack until DB is updated:
                    # some products look like this:
                    #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                    # take off everything after the OS=
                    m = re.search("(.+) OS=", cols[15])

                    if m:
                        annot.product_name = m.group(1)
                    else:
                        annot.product_name = cols[15]

                # remember the ID we just saw
                last_qry_id = this_qry_id
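The 'OS=' trimming used above is easy to check in isolation against the sample product string quoted in the comment:

import re

product = "Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1"
m = re.search("(.+) OS=", product)
print(m.group(1))   # -> Coatomer subunit gamma-2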
def main():
    parser = argparse.ArgumentParser(description="Put a description of your script here")

    ## command-line options
    parser.add_argument("-v", "--vcf_list", type=str, required=True, help="Input list of VCF files")
    parser.add_argument("-o", "--output", type=str, required=True, help="Base output path to be created.")
    args = parser.parse_args()

    vcf_files = utils.read_list_file(args.vcf_list)

    for vcf_file in vcf_files:
        plot_vcf_file(vcf_file)

    plt.legend(loc="best")
    plt.xlabel("Genotype quality")
    plt.ylabel("Divergence from reference (SNPs per kb)")
    plt.savefig("{0}.png".format(args.output))
def main():
    parser = argparse.ArgumentParser( description='Looks for FASTA entries which seem to have been embedded within another')
    parser.add_argument('-l', '--input_list', type=str, required=False, help='A list file with the paths of all files' )
    parser.add_argument('-f', '--fasta_file', type=str, required=False, help='The single FASTA file to be processed' )
    parser.add_argument('-b', '--basename', type=str, required=True, help='Base name to be used for new FASTA files' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional path to an output file to be created' )
    args = parser.parse_args()

    if args.input_list is None and args.fasta_file is None:
        raise Exception("ERROR: You must specify either --input_list or --fasta_file to define input");

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    files_to_process = list()

    if args.input_list is not None:
        for file in read_list_file( args.input_list ):
            files_to_process.append(file)

    if args.fasta_file is not None:
        files_to_process.append(args.fasta_file)

    next_id_idx = 1

    for file in files_to_process:
        for line in open(file):
            if line[0] == '>':
                line = line.rstrip()
                next_id = "{0}.{1}".format(args.basename, next_id_idx)
                next_id_idx += 1
                
                m = re.match(">\S+ (.*)", line)
                if m:
                    fout.write(">{0} {1}\n".format(next_id, m.groups(1)))
                else:
                    fout.write(">{0}\n".format(next_id))
            else:
                fout.write(line)
Example #9
def main():
    parser = argparse.ArgumentParser( description='Parses BLAST results to include or exclude entries from a FASTA file')
    parser.add_argument('-l', '--blast_tab_list', type=str, required=True, help='A list file with the paths of all files in -m 8 output format from NCBI blast' )
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='The FASTA file to be filtered' )
    parser.add_argument('-m', '--mode', type=str, required=True, choices=['include','exclude'], help='Enter either "include" or "exclude" based on the mode of filtering you want based on the blast hits.')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-e', '--eval_cutoff', type=str, required=True, help='Filter entries with an E-value lower than this' )
    args = parser.parse_args()

    ## 3 different options for each accession in the fasta_file
    #   - have no key here - wasn't found in the BLAST matches at all
    #   - key with value of 0 - found in BLAST, but with scores not meeting cutoffs
    #   - key with value of 1 - found in BLAST, with scores meeting cutoffs
    ids_matched = {}
    c_ids_matching_cutoffs = 0

    for file in read_list_file( args.blast_tab_list ):
        c_ids_matching_cutoffs += process_blast_file(file, ids_matched, args)

    [c_total, c_kept] = export_file( args.fasta_file, args.output_file, args.mode, ids_matched )

    print("INFO: Exported {0} entries out of {1} total in the source file".format(c_kept, c_total) )
def main():
    parser = argparse.ArgumentParser(
        description=
        'Looks for FASTA entries which seem to have been embedded within another'
    )
    parser.add_argument('-l',
                        '--input_list',
                        type=str,
                        required=False,
                        help='A list file with the paths of all files')
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=False,
                        help='The single FASTA file to be processed')
    args = parser.parse_args()

    if args.input_list is None and args.fasta_file is None:
        raise Exception(
            "ERROR: You must specify either --input_list or --fasta_file to define input"
        )

    files_to_process = list()

    if args.input_list is not None:
        for file in read_list_file(args.input_list):
            files_to_process.append(file)

    if args.fasta_file is not None:
        files_to_process.append(args.fasta_file)

    for file in files_to_process:
        fail_lines = process_fasta_file(file)

        if (len(fail_lines) > 0):
            print("{0}\t{1}\n".format(file, len(fail_lines)))

            for line in fail_lines:
                print("\t{0}\n".format(line))
Example #11
def main():
    parser = argparse.ArgumentParser( description='Performs selected validation of a FASTA file')

    RESIDUE_LINE_LENGTH_LIMIT = 60

    ## command-line options
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='*', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional path to an output file to be created' )
    parser.add_argument('-l', '--input_list', type=str, required=False, default=None, help='Optional path to a list file of input files' )
    parser.add_argument('-erc', '--expected_record_count', type=int, required=False, default=None, help='Optional count of records expected in the input.  An exception is raised if this is not matched.' )
    parser.add_argument('-hl', '--homopolymer_limit', type=int, required=False, default=None, help='Issues a warning for any sequences with a homopolymer of length > this' )
    parser.add_argument('-cis', '--check_internal_stops', dest='check_internal_stops', action='store_true', help='Intended for protein files; checks for internal * (stop) characters')
    parser.set_defaults(check_internal_stops=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    ## gather input
    input_files = args.fasta_files

    if args.input_list is not None:
        input_files.extend(utils.read_list_file(args.input_list))

    if len(input_files) == 0:
        raise Exception("ERROR: No input files defined")
        
    total_records = 0
    total_residues = 0
    error_count = 0
    warning_count = 0

    for ifile in input_files:
        fout.write("INFO: validating file {0}\n".format(ifile))
        line_number = 0
        ids_found = list()
        long_line_count = 0
        current_seq_length = None
        current_seq_id = None
        current_seq = ''
        current_homopolymer_base = None
        current_homopolymer_length = 0
        
        for line in open(ifile):
            line = line.rstrip()
            line_number += 1
            
            if line.startswith('>'):
                total_records += 1

                # found a new sequence entry - make sure the last one (if there was one) had residues
                if current_seq_length == 0:
                    fout.write("ERROR: Entry without residues found just before line {0} of {1}\n".format(line_number, ifile))
                    error_count += 1

                if args.check_internal_stops == True:
                    if '*' in current_seq[0:-1]:
                        fout.write("WARNING: Internal stops found within {0} of {1}\n".format(current_seq_id, ifile))
                        warning_count += 1
                
                current_seq = ''
                current_seq_length = 0
                m = re.match(">(\S+)", line)
                if m:
                    current_seq_id = m.group(1)
                    if current_seq_id in ids_found:
                        fout.write("ERROR: Duplicate ID ({2}) found on line {0} of {1}\n".format(line_number, ifile, current_seq_id))
                        error_count += 1
                    else:
                        ids_found.append(current_seq_id)
                else:
                    fout.write("ERROR: Record without ID on line {0} of {1}\n".format(line_number, ifile))
                    error_count += 1
                
            elif line.startswith('#'):
                # warn about a comment line
                fout.write("WARNING: Comment detected on line {0} of {1}\n".format(line_number, ifile))
                warning_count += 1

            else:
                # residue line
                total_residues += len(line)
                current_seq_length += len(line)
                current_seq += line

                if args.homopolymer_limit is not None:
                    for base in line:
                        if base == current_homopolymer_base:
                            current_homopolymer_length += 1
                        else:
                            if current_homopolymer_length > args.homopolymer_limit and current_homopolymer_base != 'N':
                                fout.write("WARNING: Sequence ID {0} in file {1} contains a homopolymer run ({2}) of length {3}\n".format(current_seq_id, ifile, current_homopolymer_base, current_homopolymer_length))
                                warning_count += 1

                            current_homopolymer_base = base
                            current_homopolymer_length = 1

                if '>' in line:
                    fout.write("ERROR: > character embedded in sequence residues on line {0} of {1}\n".format(line_number, ifile))
                    error_count += 1

                # not practical to print warnings for each line here, so we'll just do it once per file
                if len(line) > RESIDUE_LINE_LENGTH_LIMIT:
                    long_line_count += 1

        if current_seq_length == 0:
            fout.write("ERROR: Entry without residues found on line {0} of {1}\n".format(line_number, ifile))
            error_count += 1
        
        if long_line_count > 0:
            fout.write("WARNING: {0} residue line(s) were detected longer than {1} in file {2}\n".format(long_line_count, RESIDUE_LINE_LENGTH_LIMIT, ifile))
            warning_count += long_line_count

    if args.expected_record_count is not None and args.expected_record_count != total_records:
        fout.write("ERROR: Expected record count:{0} does not match what was found:{1}\n".format(args.expected_record_count, total_records))
        error_count += 1

    if error_count == 0:
        if warning_count == 0:
            fout.write("INFO: All files in input set appear to be valid\n")
        else:
            fout.write("INFO: All files in input set appear to be valid, but with {0} warning(s)\n".format(warning_count))
    else:
        fout.write("ERROR: total errors found in all files from input set: {0}, warning(s): {1}\n".format(error_count, warning_count))
Example #12
def main():
    parser = argparse.ArgumentParser( description="Validates file extension agrees with file's text delimiter")
    parser.add_argument('-f', '--input_file', type=str, required=False, help='Path to an input file' )
    parser.add_argument('-l', '--input_list', type=str, required=False, help='Path to an input list file' )
    args = parser.parse_args()

    files = list()

    if args.input_file is not None:
        files.append(args.input_file)
    elif args.input_list is not None:
        files = utils.read_list_file(args.input_list)
    else:
        raise Exception("ERROR: You must pass either -i or -l options")

    changed_num = 0
    for file in files:
        if file.lower().endswith( ('.svg', '.png', '.jpeg', '.jpg') ):
            #skip image files
            continue


        print("Processing file: {0}".format(file))

        file_extension = os.path.splitext(file)[1]
        print("... file extension: {0}".format(file_extension))

        fh = open(file)

        #Check if TAB file is tab delimited
        if file_extension == '.tab':
            try:
                reader = csv.reader(fh, delimiter='\t')
                for line in reader:
                    if len(line) > 1:
                        print("... no changed needed.")
                        break
                    else:
                        raise Exception("... Exception: Incorrect file extension.")
            except Exception:
                new_file = file.replace(file_extension, '.csv')
                os.rename(file, new_file)
                print("... extension changed to CSV")

                changed_num += 1

        else:
            try:
                reader = csv.reader(fh)
                for line in reader:
                    if len(line) > 1:
                        print("... no changed needed.")
                        break
                    else:
                        raise Exception("... Exception: Incorrect file extension.")
            except Exception:
                new_file = file.replace(file_extension, '.tab')
                os.rename(file, new_file)
                print("... extension changed to TAB")

                changed_num += 1

        fh.close()

    print("\nFinished.\n{0} Files renamed.".format(changed_num))
def parse_sprot_blast_evidence( log_fh, polypeptides, blast_org, blast_list, cursor, eval_cutoff ):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.
    '''
    for file in utils.read_list_file(blast_list):
        last_qry_id = None
        
        for line in open(file):
            line = line.rstrip()
            cols = line.split("\t")
            this_qry_id = cols[0]

            # skip this line if it doesn't meet the cutoff
            if float(cols[19]) > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                #  then process for known accession types
                accession = cols[5]

                if accession.startswith('sp|'):
                    # pluck the second part out of this:
                    #  sp|Q4PEV8|EIF3M_USTMA
                    accession = accession.split('|')[1]

                assertions = get_uspdb_annot( accession, cursor )
                blast_org[this_qry_id] = assertions['organism']
                    
                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    # current hack until DB is updated:
                    # some products look like this:
                    #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                    # take off everything after the OS=
                    m = re.search("(.+) OS=", cols[15])

                    if m:
                        annot.product_name = m.group(1)
                    else:
                        annot.product_name = cols[15]

                    log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'".format(this_qry_id, annot.product_name, accession))

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uspdb_ec_nums( accession, cursor ):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uspdb_go_terms( accession, cursor ):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def main():
    """This is the second script I've written in Python.  I'm sure it shows."""
    parser = argparse.ArgumentParser( description='Reads a BLAST m8 file and taxonomy DB to produce a taxonomic profile at any user-specified ranking level.')

    ## input formats: btab, blast_m8
    parser.add_argument('-f', '--input_format', type=str, required=True, help='Blast format: current options are btab or blast_m8' )

    ## The SQLite3 file that will be read for taxonomy information
    parser.add_argument('-t', '--taxonomy_db', type=str, required=True, help='Path to a taxonomy.db file created by "create_taxonomy_db.py"' )

    ## BLAST list file
    parser.add_argument('-b', '--blast_list_file', type=str, required=True, help='List of BLAST files (m8 format)' )

    ## output file to be written
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path where the result file should be written' )

    ## E-value cutoff to use
    parser.add_argument('-e', '--eval_cutoff', type=float, required=False, help='Optional E-value cutoff to use.' )

    ## Top N hits per query to score.  Only counts those where the taxon could be looked up in the indexes
    parser.add_argument('-n', '--top_n', type=int, required=False, default=1, help='Top N hits per query to score.  Only counts unique taxon matches which could be looked up in the indexes' )

    ## rank on which matches will be grouped and reported.  values like: species, genus, order, family, etc.
    parser.add_argument('-r', '--rank', type=str, required=True, help='Taxonomy rank on which to group all matches, such as: species, genus, order, family, etc.' )
    args = parser.parse_args()

    conn = sqlite3.connect( args.taxonomy_db )
    c = conn.cursor()

    blast_files = read_list_file( args.blast_list_file )

    taxon_counts = {}
    processed_file_count = 0
    stats = {}
    stats['gi_lookup_success_count'] = 0
    stats['gi_lookup_fail_count'] = 0
    stats['taxon_lookup_success_count'] = 0
    stats['taxon_lookup_failure_count'] = 0
     
    for file in blast_files:
        print("Processing file: ", file)
        if args.input_format == 'blast_m8' or args.input_format == 'btab':
            parse_blast_file( file, c, taxon_counts, args.eval_cutoff, args.input_format, stats, args.top_n )
        else:
            raise Exception("Unsupported input format passed: {0}".format(args.input_format) )

        processed_file_count += 1
        #if processed_file_count == 50:
            #break
   
    ## process the taxon counts, conforming them to the user-specified rank
    result_table = group_taxa_by_rank( args.rank, taxon_counts, c )
    node_names = get_selected_node_names( result_table, c )

    c.close()

    fout = open(args.output_file, mode='w')

    ## write the results to the output file in order of most-found clade first
    for tax_id in OrderedDict(sorted(result_table.items(), reverse=True, key=lambda t: t[1])):
        sci_name = ''
        if tax_id in node_names:
            sci_name = node_names[tax_id]
            
        fout.write( "{0}\t{1}\t{2}\n".format(tax_id, int(result_table[tax_id]), sci_name ) )

    fout.close()

    print("INFO: successful GI lookups: {0}/{1}".format(stats['gi_lookup_success_count'], \
                                                        (stats['gi_lookup_fail_count'] + stats['gi_lookup_success_count'])) )
    print("INFO: successful taxon lookups: {0}/{1}".format( stats['taxon_lookup_success_count'], \
                                                        (stats['taxon_lookup_success_count'] + stats['taxon_lookup_failure_count']) ) )
Example #15
def main():
    parser = argparse.ArgumentParser(
        description=
        'Looks for FASTA entries which seem to have been embedded within another'
    )
    parser.add_argument('-l',
                        '--input_list',
                        type=str,
                        required=False,
                        help='A list file with the paths of all files')
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=False,
                        help='The single FASTA file to be processed')
    parser.add_argument('-b',
                        '--basename',
                        type=str,
                        required=True,
                        help='Base name to be used for new FASTA files')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional path to an output file to be created')
    args = parser.parse_args()

    if args.input_list is None and args.fasta_file is None:
        raise Exception(
            "ERROR: You must specify either --input_list or --fasta_file to define input"
        )

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    files_to_process = list()

    if args.input_list is not None:
        for file in read_list_file(args.input_list):
            files_to_process.append(file)

    if args.fasta_file is not None:
        files_to_process.append(args.fasta_file)

    next_id_idx = 1

    for file in files_to_process:
        for line in open(file):
            if line[0] == '>':
                line = line.rstrip()
                next_id = "{0}.{1}".format(args.basename, next_id_idx)
                next_id_idx += 1

                m = re.match(">\S+ (.*)", line)
                if m:
                    fout.write(">{0} {1}\n".format(next_id, m.groups(1)))
                else:
                    fout.write(">{0}\n".format(next_id))
            else:
                fout.write(line)
Example #16
def main():
    """This is the second script I've written in Python.  I'm sure it shows."""
    parser = argparse.ArgumentParser(
        description=
        'Reads a BLAST m8 file and taxonomy DB to produce a taxonomic profile at any user-specified ranking level.'
    )

    ## input formats: btab, blast_m8
    parser.add_argument(
        '-f',
        '--input_format',
        type=str,
        required=True,
        help='Blast format: current options are btab or blast_m8')

    ## The SQLite3 file that will be read for taxonomy information
    parser.add_argument(
        '-t',
        '--taxonomy_db',
        type=str,
        required=True,
        help='Path to a taxonomy.db file created by "create_taxonomy_db.py"')

    ## BLAST list file
    parser.add_argument('-b',
                        '--blast_list_file',
                        type=str,
                        required=True,
                        help='List of BLAST files (m8 format)')

    ## output file to be written
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path where the result file should be written')

    ## E-value cutoff to use
    parser.add_argument('-e',
                        '--eval_cutoff',
                        type=float,
                        required=False,
                        help='Optional E-value cutoff to use.')

    ## Top N hits per query to score.  Only counts those where the taxon could be looked up in the indexes
    parser.add_argument(
        '-n',
        '--top_n',
        type=int,
        required=False,
        default=1,
        help=
        'Top N hits per query to score.  Only counts unique taxon matches which could be looked up in the indexes'
    )

    ## rank on which matches will be grouped and reported.  values like: species, genus, order, family, etc.
    parser.add_argument(
        '-r',
        '--rank',
        type=str,
        required=True,
        help=
        'Taxonomy rank on which to group all matches, such as: species, genus, order, family, etc.'
    )
    args = parser.parse_args()

    conn = sqlite3.connect(args.taxonomy_db)
    c = conn.cursor()

    blast_files = read_list_file(args.blast_list_file)

    taxon_counts = {}
    processed_file_count = 0
    stats = {}
    stats['gi_lookup_success_count'] = 0
    stats['gi_lookup_fail_count'] = 0
    stats['taxon_lookup_success_count'] = 0
    stats['taxon_lookup_failure_count'] = 0

    for file in blast_files:
        print("Processing file: ", file)
        if args.input_format == 'blast_m8' or args.input_format == 'btab':
            parse_blast_file(file, c, taxon_counts, args.eval_cutoff,
                             args.input_format, stats, args.top_n)
        else:
            raise Exception("Unsupported input format passed: {0}".format(
                args.input_format))

        processed_file_count += 1
        #if processed_file_count == 50:
        #break

    ## process the taxon counts, conforming them to the user-specified rank
    result_table = group_taxa_by_rank(args.rank, taxon_counts, c)
    node_names = get_selected_node_names(result_table, c)

    c.close()

    fout = open(args.output_file, mode='w')

    ## write the results to the output file in order of most-found clade first
    for tax_id in OrderedDict(
            sorted(result_table.items(), reverse=True, key=lambda t: t[1])):
        sci_name = ''
        if tax_id in node_names:
            sci_name = node_names[tax_id]

        fout.write("{0}\t{1}\t{2}\n".format(tax_id, int(result_table[tax_id]),
                                            sci_name))

    fout.close()

    print("INFO: successful GI lookups: {0}/{1}".format(stats['gi_lookup_success_count'], \
                                                        (stats['gi_lookup_fail_count'] + stats['gi_lookup_success_count'])) )
    print("INFO: successful taxon lookups: {0}/{1}".format( stats['taxon_lookup_success_count'], \
                                                        (stats['taxon_lookup_success_count'] + stats['taxon_lookup_failure_count']) ) )
Example #17
def parse_sprot_blast_evidence(log_fh, polypeptides, blast_org, blast_list,
                               cursor, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.
    '''
    for file in utils.read_list_file(blast_list):
        last_qry_id = None

        for line in open(file):
            line = line.rstrip()
            cols = line.split("\t")
            this_qry_id = cols[0]

            # skip this line if it doesn't meet the cutoff
            if float(cols[19]) > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                #  then process for known accession types
                accession = cols[5]

                if accession.startswith('sp|'):
                    # pluck the second part out of this:
                    #  sp|Q4PEV8|EIF3M_USTMA
                    accession = accession.split('|')[1]

                assertions = get_uspdb_annot(accession, cursor)
                blast_org[this_qry_id] = assertions['organism']

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    # current hack until DB is updated:
                    # some products look like this:
                    #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                    # take off everything after the OS=
                    m = re.search("(.+) OS=", cols[15])

                    if m:
                        annot.product_name = m.group(1)
                    else:
                        annot.product_name = cols[15]

                    log_fh.write(
                        "INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'"
                        .format(this_qry_id, annot.product_name, accession))

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uspdb_ec_nums(accession, cursor):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uspdb_go_terms(accession, cursor):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.
    '''
    for file in utils.read_list_file(blast_list):
        last_qry_id = None
        
        for line in open(file):
            line = line.rstrip()
            cols = line.split("\t")

            # We're going to ignore any lines which have a few keywords in the name
            # First character left off for initcap reasons
            if 'ncharacterized' in cols[15] or 'ypothetical' in cols[15]:
                continue
            
            this_qry_id = cols[0]

            # skip this line if it doesn't meet the cutoff
            if float(cols[19]) > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                accession = cols[5]

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    accession = cols[5]

                    # the product field looks like this:
                    # dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
                    # troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
                    if ' [EC' in cols[15] and cols[15].endswith(']'):
                        m = re.search("\; (K\d+)\s+(.+) \[EC\:(.+)\]", cols[15])
                    else:
                        m = re.search("\; (K\d+)\s+(.+)", cols[15])

                    if m:
                        kegg_id = m.group(1)
                        product = m.group(2)
                        
                        if len(m.groups()) == 3:
                            ec_num = m.group(3)
                        else:
                            ec_num = None

                        annot.product_name = product
                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                        if ec_num is not None and ec_num != '':
                            ec = annotation.ECAnnotation(number=ec_num)
                            annot.add_ec_number(ec)

                        kegg_dbxref = annotation.Dbxref(db='KEGG', identifier=kegg_id)
                        annot.add_dbxref(kegg_dbxref)
                        
                # remember the ID we just saw
                last_qry_id = this_qry_id
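The two KEGG product-name regexes above can be exercised against the sample strings quoted in the comments:

import re

with_ec = "dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]"
without_ec = "troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle"

m = re.search(r"\; (K\d+)\s+(.+) \[EC\:(.+)\]", with_ec)
print(m.group(1), m.group(2), m.group(3))   # -> K06223 DNA adenine methylase 2.1.1.72

m = re.search(r"\; (K\d+)\s+(.+)", without_ec)
print(m.group(1), m.group(2))               # -> K10371 troponin I, slow skeletal muscle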
Example #19
def main():
    parser = argparse.ArgumentParser(
        description='Performs selected validation of a FASTA file')

    RESIDUE_LINE_LENGTH_LIMIT = 60

    ## command-line options
    parser.add_argument('fasta_files',
                        metavar='N',
                        type=str,
                        nargs='*',
                        help='Pass one or more FASTA files')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional path to an output file to be created')
    parser.add_argument('-l',
                        '--input_list',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional path to a list file of input files')
    parser.add_argument(
        '-erc',
        '--expected_record_count',
        type=int,
        required=False,
        default=None,
        help=
        'Optional count of records expected in the input.  An exception is raised if this is not matched.'
    )
    parser.add_argument(
        '-hl',
        '--homopolymer_limit',
        type=int,
        required=False,
        default=None,
        help=
        'Issues a warning for any sequences with a homopolymer of length > this'
    )
    parser.add_argument(
        '-cis',
        '--check_internal_stops',
        dest='check_internal_stops',
        action='store_true',
        help=
        'Intended for protein files; checks for internal * (stop) characters'
    )
    parser.set_defaults(check_internal_stops=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    ## gather input
    input_files = args.fasta_files

    if args.input_list is not None:
        input_files.extend(utils.read_list_file(args.input_list))

    if len(input_files) == 0:
        raise Exception("ERROR: No input files defined")

    total_records = 0
    total_residues = 0
    error_count = 0
    warning_count = 0

    for ifile in input_files:
        fout.write("INFO: validating file {0}\n".format(ifile))
        line_number = 0
        ids_found = list()
        long_line_count = 0
        current_seq_length = None
        current_seq_id = None
        current_seq = ''
        current_homopolymer_base = None
        current_homopolymer_length = 0

        for line in open(ifile):
            line = line.rstrip()
            line_number += 1

            if line.startswith('>'):
                total_records += 1

                # found a new sequence entry - make sure the last one (if there was one) had residues
                if current_seq_length == 0:
                    fout.write(
                        "ERROR: Entry without residues found just before line {0} of {1}\n"
                        .format(line_number, ifile))
                    error_count += 1

                if args.check_internal_stops == True:
                    if '*' in current_seq[0:-1]:
                        fout.write(
                            "WARNING: Internal stops found within {0} of {1}\n"
                            .format(current_seq_id, ifile))
                        warning_count += 1

                current_seq = ''
                current_seq_length = 0
                m = re.match(">(\S+)", line)
                if m:
                    current_seq_id = m.group(1)
                    if current_seq_id in ids_found:
                        fout.write(
                            "ERROR: Duplicate ID ({2}) found on line {0} of {1}\n"
                            .format(line_number, ifile, current_seq_id))
                        error_count += 1
                    else:
                        ids_found.append(current_seq_id)
                else:
                    fout.write(
                        "ERROR: Record without ID on line {0} of {1}\n".format(
                            line_number, ifile))
                    error_count += 1

            elif line.startswith('#'):
                # warn about a comment line
                fout.write(
                    "WARNING: Comment detected on line {0} of {1}\n".format(
                        line_number, ifile))
                warning_count += 1

            else:
                # residue line
                total_residues += len(line)
                current_seq_length += len(line)
                current_seq += line

                if args.homopolymer_limit is not None:
                    for base in line:
                        if base == current_homopolymer_base:
                            current_homopolymer_length += 1
                        else:
                            if current_homopolymer_length > args.homopolymer_limit and current_homopolymer_base != 'N':
                                fout.write(
                                    "WARNING: Sequence ID {0} in file {1} contains a homopolymer run ({2}) of length {3}\n"
                                    .format(current_seq_id, ifile,
                                            current_homopolymer_base,
                                            current_homopolymer_length))
                                warning_count += 1

                            current_homopolymer_base = base
                            current_homopolymer_length = 1

                if '>' in line:
                    fout.write(
                        "ERROR: > character embedded in sequence residues on line {0} of {1}\n"
                        .format(line_number, ifile))
                    error_count += 1

                # not practical to print warnings for each line here, so we'll just do it once per file
                if len(line) > RESIDUE_LINE_LENGTH_LIMIT:
                    long_line_count += 1

        if current_seq_length == 0:
            fout.write(
                "ERROR: Entry without residues found on line {0} of {1}\n".
                format(line_number, ifile))
            error_count += 1

        if long_line_count > 0:
            fout.write(
                "WARNING: {0} residue line(s) were detected longer than {1} in file {2}\n"
                .format(long_line_count, RESIDUE_LINE_LENGTH_LIMIT, ifile))
            warning_count += long_line_count

    if args.expected_record_count is not None and args.expected_record_count != total_records:
        fout.write(
            "ERROR: Expected record count:{0} does not match what was found:{1}\n"
            .format(args.expected_record_count, total_records))
        error_count += 1

    if error_count == 0:
        if warning_count == 0:
            fout.write("INFO: All files in input set appear to be valid\n")
        else:
            fout.write(
                "INFO: All files in input set appear to be valid, but with {0} warning(s)\n"
                .format(warning_count))
    else:
        fout.write(
            "ERROR: total errors found in all files from input set: {0}, warning(s): {1}\n"
            .format(error_count, warning_count))
def parse_sprot_blast_evidence( log_fh, polypeptides, blast_org, blast_list, cursor, eval_cutoff, algorithm ):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")
    
    for file in utils.read_list_file(blast_list):
        last_qry_id = None
        
        for line in open(file):
            # 0 indexing is faster than startswith()
            if line[0] == '#':
                continue
            
            line = line.rstrip()
            cols = line.split("\t")
            this_qry_id = cols[0]

            if algorithm == 'blast':
                e_value = float(cols[19])
            elif algorithm == 'rapsearch2':
                ## rapsearch2 can actually report values outside of python's double range.  Handle these 
                try:
                    e_value = math.pow(10, float(cols[10]))
                except OverflowError:
                    print("WARN: couldn't handle E-value math on the following line (setting to 0):\n{0}".format(line))
                    e_value = 0

            # skip this line if it doesn't meet the cutoff
            if e_value > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession from the cols[5]
                #  then process for known accession types
                if algorithm == 'blast':
                    accession = cols[5]
                elif algorithm == 'rapsearch2':
                    accession = cols[1]

                if accession.startswith('sp|'):
                    # pluck the second part out of this:
                    #  sp|Q4PEV8|EIF3M_USTMA
                    accession = accession.split('|')[1]

                assertions = get_uspdb_annot( accession, cursor )
                blast_org[this_qry_id] = assertions['organism']

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if algorithm == 'blast':
                        # current hack until DB is updated:
                        # some products look like this:
                        #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                        # take off everything after the OS=
                        m = re.search("(.+) OS=", cols[15])

                        if m:
                            annot.product_name = m.group(1)
                        else:
                            annot.product_name = cols[15]
                    elif algorithm == 'rapsearch2':
                        annot.product_name = assertions['product']

                    log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uspdb_ec_nums( accession, cursor ):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uspdb_go_terms( accession, cursor ):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']

                # remember the ID we just saw
                last_qry_id = this_qry_id
def main():
    parser = argparse.ArgumentParser(
        description='Parses GenBank (GBK) flat files and reports coordinates for genes already present in the database')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=False,
                        help='Path to an input GBK file')
    parser.add_argument('-l',
                        '--input_list',
                        type=str,
                        required=False,
                        help='Path to an input GBK list file')
    parser.add_argument('-oid',
                        '--organism_id',
                        type=str,
                        required=True,
                        help='Organism ID being loaded')

    files = list()
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read('gear.ini')

    if args.input_file is not None:
        files.append(args.input_file)
    elif args.input_list is not None:
        files = utils.read_list_file(args.input_list)
    else:
        raise Exception("ERROR: You must pass either -i or -l options")

    try:
        cnx = mysql.connector.connect(user=config['database']['user'],
                                      password=config['database']['password'],
                                      host=config['database']['host'],
                                      database=config['database']['name'])
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)

    cursor = cnx.cursor()
    genes_by_ensembl_id = gear.cache_genes_by_ensembl_id(cursor)
    genes_by_sym = loaderutils.cache_genes_by_primary_gene_symbol(cursor,
                                                                  lower=True)

    for file in files:
        recompress = False

        # decompress the file if needed.
        if file.endswith(".gz"):
            os.system("gunzip {0}".format(file))
            recompress = True
            m = re.match("(.+\.([A-Za-z0-9]+)\.dat)\.gz", file)
            if m:
                file = m.group(1)
                chromosome = m.group(2)
            else:
                raise Exception(
                    "This should not have happened.  Failed to regex file path"
                )
        else:
            recompress = False

        # each gb_record is a SeqRecord object
        print("Processing file {0} ...".format(file))
        for gb_record in SeqIO.parse(open(file, "r"), "genbank"):
            mol_id = gb_record.name

            # each feat is a SeqFeature object
            for feat in gb_record.features:
                if 'gene' in feat.qualifiers:
                    ensembl_id = feat.qualifiers['gene'][0]

                    # remove any versioning
                    if '.' in ensembl_id:
                        ensembl_id = ensembl_id.split('.')[0]

                    if ensembl_id not in genes_by_ensembl_id:
                        continue
                else:
                    # features without a 'gene' qualifier have no Ensembl ID to look up
                    continue

                if feat.type == 'gene':
                    gene_id = genes_by_ensembl_id[ensembl_id]

                    start = feat.location.start.position
                    stop = feat.location.end.position

                    print("{0}\t{1}\t{2}".format(ensembl_id, start, stop))

        # now recompress the file, if necessary
        if recompress:
            os.system("gzip {0}".format(file))

    cnx.commit()
    cursor.close()
    cnx.close()
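
# A sketch of an alternative to the gunzip/gzip shell calls in main() above: Biopython's
# SeqIO accepts any text handle, so the gzipped GBK file can be read in place with the
# standard-library gzip module and never needs to be decompressed on disk.  The
# "<name>.<chromosome>.dat.gz" naming convention and the example path are assumptions
# carried over from the regex above, not a documented layout.
import gzip
import re
from Bio import SeqIO

def parse_gbk_records(path):
    m = re.match(r"(.+\.([A-Za-z0-9]+)\.dat)\.gz$", path)
    chromosome = m.group(2) if m else None

    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as fh:
        for gb_record in SeqIO.parse(fh, "genbank"):
            yield chromosome, gb_record

# usage sketch (hypothetical path):
# for chrom, record in parse_gbk_records("example.21.dat.gz"):
#     print(chrom, record.name)
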
def parse_tmhmm_evidence( log_fh, polypeptides, htab_list ):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"
    
    It depends on the evidence. For the prok pipe we are pretty conservative: we require five TMHMM
    domains before we call it a putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of evidence types.
    An annotation to an ECO term is used in conjunction with another annotation, like a GO term
    (but many other types of annotation can be, and are, used with ECO). It provides additional
    information about the annotation. In fact for GO, the assignment of an evidence term along
    with a GO term is a required part of a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904
    '''
    # The number of membrane-spanning helices required before something is counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'
    
    for file in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0
        
        for line in open(file):
            if line.startswith('<'): continue
            m = re.match("# (.+?)\s+Length: \d+", line)

            if m:
                current_id = m.group(1)
                
                # purge previous result
                if current_helix_count >= MIN_HELICAL_SPANS:
                    annot = polypeptides[last_qry_id].annotation

                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        annot.product_name = GENE_PRODUCT_NAME
                        log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(last_qry_id, annot.product_name, current_helix_count))
                    else:
                        log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(last_qry_id, current_helix_count))

                    ## the GO term is added regardless of whether the product name was updated
                    annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

                # reset
                last_qry_id = current_id
                current_helix_count = 0
                continue

            cols = line.split()
            if len(cols) == 5 and cols[2] == 'TMhelix':
                current_helix_count += 1

        # the final query in each file has no following header line to trigger the
        # purge above, so apply the same check once more here
        if last_qry_id is not None and current_helix_count >= MIN_HELICAL_SPANS:
            annot = polypeptides[last_qry_id].annotation

            if annot.product_name == DEFAULT_PRODUCT_NAME:
                annot.product_name = GENE_PRODUCT_NAME
                log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(last_qry_id, annot.product_name, current_helix_count))
            else:
                log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(last_qry_id, current_helix_count))

            ## the GO term is added regardless of whether the product name changed
            annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))
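
# A standalone sketch of the TMhelix counting that parse_tmhmm_evidence() performs,
# run against lines in the raw TMHMM format shown in its docstring.  The dict returned
# here is a stand-in for the polypeptide/annotation updates the real function makes.
import re

def count_tm_helices(lines):
    counts = {}
    for line in lines:
        if line.startswith('<'):
            continue  # skip HTML lines
        if re.match(r"# (.+?)\s+Length: \d+", line):
            continue  # per-query header lines carry no helix rows
        cols = line.split()
        if len(cols) == 5 and cols[2] == 'TMhelix':
            counts[cols[0]] = counts.get(cols[0], 0) + 1
    return counts

_sample = [
    "CHARM010_V2.mRNA.887\tTMHMM2.0\tTMhelix\t12\t34",
    "CHARM010_V2.mRNA.887\tTMHMM2.0\toutside\t35\t712",
    "CHARM010_V2.mRNA.887\tTMHMM2.0\tTMhelix\t713\t735",
]
assert count_tm_helices(_sample) == {"CHARM010_V2.mRNA.887": 2}
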
def parse_uniref100_blast_evidence( log_fh, polypeptides, blast_list, cursor, eval_cutoff, algorithm, uniref100_fasta_path ):
    '''
    Reads a list file of NCBI BLAST evidence and a dict of polypeptides, populating
    each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    ## need to load the UniRef100 to TrEMBL accession lookup from the FASTA
    # like UniRef100_K1T359 -> K1T359_9ZZZZ
    uniref2acc = dict()
    print("INFO: parsing UniRef100 FASTA headers for annotation")
    if algorithm == 'rapsearch2':
        for line in open(uniref100_fasta_path):
            if line[0] == '>':
                m = re.match("\>(\S+) (.+) n=.+RepID=(\S+)", line)
                if m:
                    uniref2acc[m.group(1)] = {'acc': m.group(3), 'prod': m.group(2)}
    
    for file in utils.read_list_file(blast_list):
        last_qry_id = None
        
        for line in open(file):
            # 0 indexing is faster than startswith()
            if line[0] == '#':
                continue
            
            line = line.rstrip()
            cols = line.split("\t")
            this_qry_id = cols[0]

            # Ignore any hits whose product name contains one of a few keywords.
            # The first character of each keyword is left off so the match works
            # regardless of the initial capitalization.
            if algorithm == 'blast':
                skip_products = ['ncharacterized', 'ypothetical', 'enomic scaffold']
                skip = False
                for keyword in skip_products:
                    if keyword in cols[15]:
                        skip = True

                if skip:
                    continue

            if algorithm == 'blast':
                e_value = float(cols[19])
            elif algorithm == 'rapsearch2':
                ## rapsearch2 reports log10(E-values); converting back can overflow Python's float range, so handle that here
                try:
                    e_value = math.pow(10, float(cols[10]))
                except OverflowError:
                    print("WARN: couldn't handle E-value math on the following line (setting to 0):\n{0}".format(line))
                    e_value = 0

            # skip this line if it doesn't meet the cutoff
            if e_value > eval_cutoff:
                continue

            # the BLAST hits are sorted already with the top hit for each query first
            if last_qry_id != this_qry_id:
                annot = polypeptides[this_qry_id].annotation

                # get the accession then process for known accession types
                accession = None

                if algorithm == 'blast':
                    # UniRef100_K1T359 -> K1T359_9ZZZZ
                    m = re.search("RepID\=(\S+)", cols[15])
                    if m:
                        accession = m.group(1)
                    else:
                        raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))
                elif algorithm == 'rapsearch2':
                    accession = uniref2acc[cols[1]]['acc']

                assertions = get_uniref_annot( accession, cursor )

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    if algorithm == 'blast':
                        # these hits look like this:
                        #  AD-specific glutamate dehydrogenase n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8
                        m = re.match("(.+) n\=.+", cols[15])
                        if m:
                            annot.product_name = m.group(1)
                        else:
                            raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))

                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to UniRef100 accession '{2}'\n".format(this_qry_id, annot.product_name, accession))
                        
                    elif algorithm == 'rapsearch2':
                        annot.product_name = uniref2acc[cols[1]]['prod']
                        
                # if no EC numbers have been set, they can inherit from this
                if len(annot.ec_numbers) == 0:
                    for ec_annot in get_uniref_ec_nums( accession, cursor ):
                        annot.add_ec_number(ec_annot)

                # if no GO IDs have been set, they can inherit from this
                if len(annot.go_annotations) == 0:
                    for go_annot in get_uniref_go_terms( accession, cursor ):
                        annot.add_go_annotation(go_annot)

                # if no gene symbol has been set, it can inherit from this
                if annot.gene_symbol is None:
                    annot.gene_symbol = assertions['symbol']
                    
                # remember the ID we just saw
                last_qry_id = this_qry_id
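
# A quick self-contained check of the UniRef100 FASTA header regex used in the
# rapsearch2 branch above.  The sample header is constructed from the example hit
# quoted in the blast branch ("... RepID=M2RLB9_CERS8"); it is illustrative, not
# copied from a real UniRef100 FASTA file.
import re

_header = (">UniRef100_M2RLB9 AD-specific glutamate dehydrogenase "
           "n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8")

_m = re.match(r"\>(\S+) (.+) n=.+RepID=(\S+)", _header)
if _m:
    uniref2acc = {_m.group(1): {'acc': _m.group(3), 'prod': _m.group(2)}}
    # expected: {'UniRef100_M2RLB9': {'acc': 'M2RLB9_CERS8',
    #                                 'prod': 'AD-specific glutamate dehydrogenase'}}
    print(uniref2acc)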