Example #1
def main():
    option_parser, opts, args =\
    parse_command_line_parameters(**script_info)

    if opts.counts_dir is None:
        raise RuntimeError('No data available')

    if os.path.exists(opts.save_path) and opts.force_overwrite is False:
        raise RuntimeError(
            'Output BED file exists and overwriting not enabled. Exiting.\n')

    output_bed_file = open(opts.save_path, 'w')

    dir_list = os.listdir(opts.counts_dir)

    for chrom_file in dir_list:
        if chrom_file[-7:] == '.txt.gz':
            # slice off the 'chr' prefix and '.txt.gz' suffix; str.lstrip/rstrip
            # strip character *sets*, not substrings, so they are unsafe here
            # (assumes file names look like chrN.txt.gz)
            chrom_number = chrom_file[len('chr'):-len('.txt.gz')]
            print 'Chromosome file name: %s, chromosome number: %s\n' % (
                chrom_file, chrom_number)

            read_file = os.path.join(opts.counts_dir, chrom_file)

            make_bed_entries(read_file, chrom_number, opts.feature_name,
                             output_bed_file, opts.max_read_length,
                             opts.count_max_length, chrom_number)
        else:
            print "\nSkipping file: %s as not data file.\n" % (chrom_file)

    output_bed_file.close()
Example #2
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map), False)
    cs_results = parse_cs_chimeras(open(opts.input_cs))
    b3_results = parse_b3_chimeras(open(opts.input_bellerophon))
    
    output = open(opts.output,'w')
    output.write("#accession\treason\tnote\tnote\n")
    overlap = get_overlap(b3_results, cs_results)
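    # sequences flagged by both tools are written once here and skipped in the
    # per-tool loops below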
    for id_ in overlap:
        output.write("%s\tFound by both Bellerophon and ChimeraSlayer\n" % id_)

    for id_, score, parent_a, parent_b in b3_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_,"Class conflict found by Bellerophon"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')

    for id_, parent_a, parent_b in cs_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_,"Class conflict found by ChimeraSlayer"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')
Example #3
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map), False)
    cs_results = parse_cs_chimeras(open(opts.input_cs))
    b3_results = parse_b3_chimeras(open(opts.input_bellerophon))

    output = open(opts.output, 'w')
    output.write("#accession\treason\tnote\tnote\n")
    overlap = get_overlap(b3_results, cs_results)
    for id_ in overlap:
        output.write("%s\tFound by both Bellerophon and ChimeraSlayer\n" % id_)

    for id_, score, parent_a, parent_b in b3_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by Bellerophon"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')

    for id_, parent_a, parent_b in cs_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by ChimeraSlayer"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')
Example #4
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_gbs = opts.input_gbs.split(',')
    output_dir = opts.output_dir
    verbose = opts.verbose
    tag = opts.tag
    existing_fp = opts.existing
    max_failures = opts.max_failures
    
    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    observed_records = parse_column(open(existing_fp))

    sequences_fp = os.path.join(output_dir, '%s_sequences.fasta.gz' % tag)
    gg_records_fp = os.path.join(output_dir, '%s_ggrecords.txt.gz' % tag)
    obs_records_fp = os.path.join(output_dir, '%s_obsrecords.txt.gz' % tag)
    
    sequences = open(sequences_fp,'w')
    gg_records = open(gg_records_fp, 'w')
    obs_records = open(obs_records_fp, 'w')
    
    seen = set([])
    for gb_fp in input_gbs:
        logline = log_f("Start parsing of %s..." % gb_fp)
        logger.write(logline)

        if verbose:
            stdout.write(logline)

        records = MinimalGenbankParser(open(gb_fp))
        
        failure_count = 0
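        # IUPAC nucleotide codes (canonical bases, N, and the ambiguity codes),
        # in both upper and lower case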
        alpha = set(['A','T','G','C',
                     'a','t','g','c',
                     'N','n',
                     'R','Y','S','M',
                     'r','y','s','m',
                     'K','k','W','w',
                     'V','v','H','h','B','b','D','d'])

        while failure_count < max_failures:
            # gracefully handle parser errors to a limit
            try:
                next_record = records.next()
            except PartialRecordError, e:
                failure_count += 1
                continue
            except StopIteration:
                break
            except Exception, e:
                logline = log_f("Caught: %s, previous accession: %s" % (e, accession))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
Example #5
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    if None in (opts.hostname, opts.user, opts.passwd):
        assert len(set((opts.hostname, opts.user, opts.passwd))) == 1,\
            'You must provide all MySQL options, or none at all.'
    
    if opts.hostname is not None:
        account = HostAccount(opts.hostname,opts.user,opts.passwd)
    elif 'ENSEMBL_ACCOUNT' in os.environ:
        h, u, p = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(h,u,p)
    else:
        account = None
    
    if opts.test_run:
        print account
    
    outdir = os.path.abspath(opts.outdir)
    if not os.path.exists(outdir):
        print 'FAIL: %s directory does not exist' % outdir
        exit(-1)
    
    if not opts.by_chrom:
        outfile_name = os.path.join(outdir, '%s-%s.fasta' % (opts.species, opts.release))
        if not opts.test_run:
            outfile = open(outfile_name, 'w')
    
    if opts.test_run:
        print 'Will write to: %s' % outdir
        if not opts.by_chrom:
            print outfile_name
    
    for chrom in get_chrom_seqs(opts.species, opts.release, account,
                                debug=opts.test_run):
        fasta = chrom.toFasta()
        
        if opts.by_chrom:
            outfile_name = os.path.join(outdir, '%s.fasta' % chrom.Name)
        
        if opts.test_run:
            print 'Will write to: %s' % outfile_name
            break
        
        if opts.by_chrom:
            outfile = open(outfile_name, 'w')
        
        outfile.write(fasta+'\n')
        
        if opts.by_chrom:
            outfile.close()
Example #6
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map))
    uchime_results = parse_uchime_chimeras(open(opts.input_uchime))
    
    output = open(opts.output,'w')
    output.write("#accession\treason\tnote\tnote\n")

    for id_, score, parent_a, parent_b in uchime_results:
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_,"Class conflict found by UCHIME"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')
Example #7
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map))
    uchime_results = parse_uchime_chimeras(open(opts.input_uchime))

    output = open(opts.output, 'w')
    output.write("#accession\treason\tnote\tnote\n")

    for id_, score, parent_a, parent_b in uchime_results:
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by UCHIME"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')
Example #8
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    # setup DB connection
    cred = Credentials()
    con = connect(cred.liveMetadataDatabaseConnectionString)
    cursor = con.cursor()
    existing_kit_ids = get_used_kit_ids(cursor)

    if opts.input:
        kit_passwd_map, kit_barcode_map = preassigned_kits(opts, cursor, 
                                                           existing_kit_ids)
    else:
        if not opts.tag or not opts.number_of_kits \
                        or not opts.swabs_per_kit:
            option_parser.error("Must specify tag, number of samples and number of swabs")
        kit_passwd_map, kit_barcode_map = unassigned_kits(opts, cursor, 
                                                          existing_kit_ids)
    
    f = open(opts.output + '.printouts', 'w')
    f.write('\n'.join(get_printout_data(kit_passwd_map, kit_barcode_map)))
    f.write('\n')
    f.close()
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # if we already have these records, then we do not need to reobtain them
    if opts.existing_gb:
        existing_gis = set([l.strip() for l in open(opts.existing_gb)])
    else:
        existing_gis = set([])

    if opts.verbose:
        print "Number of existing GIs: %d" % len(existing_gis)

    if opts.possible_new_gb_out is None:
        option_parser.error("Need to specify --possible-new-gb-output")

    if opts.cached_ids:
        possible_gis = set([l.strip() for l in open(opts.cached_ids)])
    else:
        #ncbi_record_queries = ['16S','18S','small subunit','rrna[fkey]','ribosomal']
        ncbi_record_queries = ['16S AND tm7']
        # grab all the ids
        possible_gis = set([])
        for query in ncbi_record_queries:
            if opts.verbose:
                cur_size = len(possible_gis)
            possible_gis.update(esearch(query, retmax=10000000))

            if opts.verbose:
                print "Query %s added %d to set" % (query, len(possible_gis) - cur_size)

    # drop out any existing ids
    possible_gis = possible_gis - existing_gis

    if opts.verbose:
        print "Total number of GIs to query: %d" % len(possible_gis)
   
    chunk_count = 0
    total_bytes = 0
    if opts.use_gz:
        poss_output = open_gz(opts.possible_new_gb_out,'w')
    else:
        poss_output = open(opts.possible_new_gb_out,'w')
    
    collected = set([])

    retries = 0
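    # re-run bulk_efetch until every GI has been fetched or 100 retries are
    # exhausted; successfully collected GIs are removed from the work set below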
    while possible_gis and retries < 100:
        try:
            for chunk in bulk_efetch(possible_gis):
                chunk_count += 1
                total_bytes += len(chunk)

                # Occasionally, and silently, NCBI corrupts records. 
                if '<html>' in chunk:
                    if opts.verbose:
                        print "Erroneous record in chunk, disregarding full chunk"
                    continue

                # pullout the GIs
                records = [] 
                for l in chunk.splitlines():
                    if l.startswith('VERSION'):
                        records.append(l.split(':')[1])

                if opts.verbose:
                    print "%s - retry: %d, Chunk %d, covering %d records, writing %d bytes, %d written in total" % \
                        (time.strftime("%m-%d-%y %H:%M:%S"), retries, chunk_count, len(records), len(chunk), total_bytes)
                poss_output.write(chunk)
                collected.update(set(records))
        except Exception, e:
            retries += 1
            print "Caught exception: ", e
        possible_gis = possible_gis - collected
        collected = set([])
        
        possible_gis_at_retry = open('possible_retries_at_retry_%d.txt.gz' % retries, 'w')
        possible_gis_at_retry.write('\n'.join(possible_gis))
        possible_gis_at_retry.close()
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id

    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir, "%s.records.noggid.txt" \
                                                    % tag)

    existing_records = parse_column(open(existing_fp))

    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #                for r in MinimalGreengenesParser(open(gg_records_fp))])

    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        aln = filter(None, [get_indexed_sequence(i, acc) for i in aligned])
        noaln = filter(None, [get_indexed_sequence(i, acc) for i in unaligned])

        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        if not noaln:
            logline = log_f("GG record %s does not have unaligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # if > 1 rec, complain

        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Aligned seq %s does not have a GG record" %
                                id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if id_ in existing_records:
                logline = log_f("%s has previously been observed!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if record['gg_id'] is not None:
                logline = log_f("%s already has gg_id %d!" %\
                                    (id_,record['gg_id']))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['gg_id'] = gg_id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(seq, invariants)
            non_ACGT = calc_nonACGT(seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = seq
            record['n_pos_aligned'] = len(seq) - seq.count('-')

    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %\
                                 id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % (id_))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    for record in records.values():
        if record['gg_id'] is None:
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                record.sanityCheck()
            except:
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" %
                                 (record['gg_id'], record['ncbi_acc_w_ver']))
    output_gg.close()
Example #11
def main():
    from optparse import make_option
    from cogent.util.misc import parse_command_line_parameters
    from sys import exit, stdout
    
    script_info = {}
    script_info['brief_description'] = "Parse raw Greengenes 16S records"
    script_info['script_description'] = """Parse out specific fields from raw Greengenes 16S records. These records are rich but often only a subset of each record is required for downstream processing."""
    script_info['script_usage'] = []
    script_info['script_usage'].append(("""Example:""","""Greengenes taxonomy and raw sequences are needed:""","""python greengenes.py -i greengenes16SrRNAgenes.txt -o gg_seq_and_tax.txt -f prokMSA_id,greengenes_tax_string,aligned_seq"""))
    script_info['script_usage'].append(("""Example:""","""Spitting out the available fields from Greengenes:""","""python greengenes.py -i greengenes16SrRNAgenes.txt --print-fields"""))
    script_info['output_description'] = """The resulting output file will contain a header that is prefixed with a # and delimited by the specified delimiter (default is tab). All records will follow in the same order with the same delimiter. It is possible for some key/value pairs within a record to lack a value. In this case, the value placed will be ''"""
    script_info['required_options']=[make_option('--input','-i',dest='input',\
                  help='Greengenes Records')]
    script_info['optional_options']=[\
               make_option('--output','-o',dest='output',help='Output file'),
               make_option('--fields','-f',dest='fields',\
                  help='Greengenes fields to keep'),
               make_option('--delim','-d',dest='delim',help='Output delimiter',\
                       default="\t"),
               make_option('--list-of-ids','-l',dest='ids',default=None,\
                   help='File with a single column list of ids to retrieve'),
               make_option('--print-fields','-p',dest='print_fields',\
                  help='Prints available fields from first Greengenes Record',\
                  action='store_true',default=False)]
    script_info['version'] = __version__
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.print_fields:
        gg_parser = MinimalGreengenesParser(open(opts.input))
        rec = gg_parser.next()
        print '\n'.join(sorted(rec.keys()))
        exit(0)

    if not opts.fields:
        print option_parser.usage()
        print
        print "Greengenes fields must be specified!"
        exit(1)

    if not opts.output:
        output = stdout
    else:
        output = open(opts.output,'w')

    fields = opts.fields.split(',')
    output.write("#%s\n" % opts.delim.join(fields))

    if opts.ids:
        ids = set([l.strip() for l in open(opts.ids, 'U')])
    else:
        ids = None

    gg_parser = SpecificGreengenesParser(open(opts.input), fields, ids)

    for record in gg_parser:
        output.write(opts.delim.join(record))
        output.write('\n')

    if opts.output:
        output.close()
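
The output_description above documents the file layout written by this loop: a "#"-prefixed, delimiter-joined header line followed by one delimiter-joined record per line. As a minimal sketch (not part of the original script, with an illustrative function name), such a file could be read back like this:

def read_greengenes_table(path, delim='\t'):
    """Yield one dict per record from a file in the format written above."""
    with open(path) as fh:
        # header line looks like "#field1<delim>field2..."
        fields = fh.readline().lstrip('#').rstrip('\n').split(delim)
        for line in fh:
            yield dict(zip(fields, line.rstrip('\n').split(delim)))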
Example #12
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id


    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter 
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir, "%s.records.noggid.txt" \
                                                    % tag)
    
    existing_records = parse_column(open(existing_fp))
    
    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #                for r in MinimalGreengenesParser(open(gg_records_fp))])
    
    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']

        ### NEED DOMAIN!
        aln = filter(None, [get_indexed_sequence(i, acc) for i in aligned])
        noaln = filter(None, [get_indexed_sequence(i, acc) for i in unaligned])
        
        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        if not noaln:
            logline = log_f("GG record %s does not have unaligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # if > 1 rec, complain

        
        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0] # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Aligned seq %s does not have a GG record" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if id_ in existing_records:
                logline = log_f("%s has previously been observed!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if record['gg_id'] is not None:
                logline = log_f("%s already has gg_id %d!" %\
                                    (id_,record['gg_id']))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
        
            record['gg_id'] = gg_id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(seq, invariants)
            non_ACGT = calc_nonACGT(seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = seq
            record['n_pos_aligned'] = len(seq) - seq.count('-')

    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0] # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %\
                                 id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
    
            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % (id_))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)
    
    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp,'w')
    output_gg = open(output_gg_fp,'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    for record in records.values():
        if record['gg_id'] is None:
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                record.sanityCheck()
            except:
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" % (record['gg_id'], 
                                               record['ncbi_acc_w_ver']))
    output_gg.close()
    
Example #13
    by_length = {}
    for ((name, email), (kits, codes)) in mapping.items():
        n_kits = len(kits)

        if n_kits not in by_length:
            by_length[n_kits] = []

        new_rec = [name, email]
        new_rec.extend(kits)
        new_rec.extend(codes)
        by_length[n_kits].append(new_rec)
    return by_length

if __name__ == '__main__':
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    con = connect(user=opts.user, password=opts.password, dsn=opts.dsn)
    cur = con.cursor()

    if opts.full_query:
        cur.execute(FULL_QUERY)
    else:
        cur.execute(UNVER_QUERY)

    results = cur.fetchall()
    collapsed = collapse_names(results)

    for n_kits in sorted(collapsed.keys()):
        f = open(opts.outfile_fp + '_%d_kits.txt' % n_kits, 'w')
        f.write("#name\temail\t")
        f.write('\t'.join(["kit_id"] * n_kits))
Example #14
def main():
    """ dump stableIDs for expressing genes in a study based on commonality """
    script_info = set_environment()
    option_parser, opts, args =\
            parse_command_line_parameters(**script_info)

    rr = RunRecord()

    if opts.sample1 is None:
        raise RuntimeError('No samples given')

    # These will hold the ids we want to intersect, etc
    sample1_ids = set()
    sample2_ids = set()
    sample3_ids = set()

    # Get all the genes and build ensembl ID sets
    session = _create_session()
    sample1_genes, rr = getExpressedGenes(
        session,
        opts.sample1,
        opts.sample1_type,
        opts.m1,
        opts.sample_extremes,
        ignore_bulk=opts.ignore_bulk,
        ignore_top_extreme=opts.ignore_top_extreme,
        ignore_bottom_extreme=opts.ignore_bottom_extreme,
        rr=rr)
    for gene in sample1_genes:
        sample1_ids.add(gene.ensembl_id)
    session.close()

    session = _create_session()
    sample2_genes, rr = getExpressedGenes(
        session,
        opts.sample2,
        opts.sample2_type,
        opts.m2,
        opts.sample_extremes,
        ignore_bulk=opts.ignore_bulk,
        ignore_top_extreme=opts.ignore_top_extreme,
        ignore_bottom_extreme=opts.ignore_bottom_extreme,
        rr=rr)
    for gene in sample2_genes:
        sample2_ids.add(gene.ensembl_id)
    session.close()

    if opts.sample3 is not None:
        session = _create_session()
        sample3_genes, rr = getExpressedGenes(
            session,
            opts.sample3,
            opts.sample3_type,
            opts.m3,
            opts.sample_extremes,
            ignore_bulk=opts.ignore_bulk,
            ignore_top_extreme=opts.ignore_top_extreme,
            ignore_bottom_extreme=opts.ignore_bottom_extreme,
            rr=rr)
        for gene in sample3_genes:
            sample3_ids.add(gene.ensembl_id)
        session.close()

    # Find the IDs we're interested in based on sample relationship
    comparison_type = opts.comparison_type.split(':')[0]
    output_id_set, rr = compare_ids(sample1_ids, sample2_ids, sample3_ids,
                                    comparison_type, rr)

    # Narrow search if needed by top genes
    if opts.num_genes is not None:
        output_id_set, rr = restrict_by_num_genes(
            output_id_set, opts.num_genes, opts.expression_sample1,
            opts.expression_sample2, opts.expression_sample3,
            opts.favoured_expression_sample, rr)

    rr.addInfo('gene_overlap', 'Total genes in output', len(output_id_set))
    # now save to file
    outfile = open(opts.genefile, 'w')
    # Add the standard header so that we can import with add_expression_db.py
    outfile.write('gene\n')
    for id in output_id_set:
        outfile.write(str(id) + '\n')
    outfile.close()
    rr.display()
Example #15
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # if we already have these records, then we do not need to reobtain them
    if opts.existing_gb:
        existing_gis = set([l.strip() for l in open(opts.existing_gb)])
    else:
        existing_gis = set([])

    if opts.verbose:
        print "Number of existing GIs: %d" % len(existing_gis)

    if opts.possible_new_gb_out is None:
        option_parser.error("Need to specify --possible-new-gb-output")

    if opts.cached_ids:
        possible_gis = set([l.strip() for l in open(opts.cached_ids)])
    else:
        # ncbi_record_queries = ['16S','18S','small subunit','rrna[fkey]','ribosomal']
        ncbi_record_queries = ["16S AND tm7"]
        # grab all the ids
        possible_gis = set([])
        for query in ncbi_record_queries:
            if opts.verbose:
                cur_size = len(possible_gis)
            possible_gis.update(esearch(query, retmax=10000000))

            if opts.verbose:
                print "Query %s added %d to set" % (query, len(possible_gis) - cur_size)

    # drop out any existing ids
    possible_gis = possible_gis - existing_gis

    if opts.verbose:
        print "Total number of GIs to query: %d" % len(possible_gis)

    chunk_count = 0
    total_bytes = 0
    if opts.use_gz:
        poss_output = open_gz(opts.possible_new_gb_out, "w")
    else:
        poss_output = open(opts.possible_new_gb_out, "w")

    collected = set([])

    retries = 0
    while possible_gis and retries < 100:
        try:
            for chunk in bulk_efetch(possible_gis):
                chunk_count += 1
                total_bytes += len(chunk)

                # Occasionally, and silently, NCBI corrupts records.
                if "<html>" in chunk:
                    if opts.verbose:
                        print "Erroneous record in chunk, disregarding full chunk"
                    continue

                # pullout the GIs
                records = []
                for l in chunk.splitlines():
                    if l.startswith("VERSION"):
                        records.append(l.split(":")[1])

                if opts.verbose:
                    print "%s - retry: %d, Chunk %d, covering %d records, writing %d bytes, %d written in total" % (
                        time.strftime("%m-%d-%y %H:%M:%S"),
                        retries,
                        chunk_count,
                        len(records),
                        len(chunk),
                        total_bytes,
                    )
                poss_output.write(chunk)
                collected.update(set(records))
        except Exception, e:
            retries += 1
            print "Caught exception: ", e
        possible_gis = possible_gis - collected
        collected = set([])

        possible_gis_at_retry = open("possible_retries_at_retry_%d.txt.gz" % retries, "w")
        possible_gis_at_retry.write("\n".join(possible_gis))
        possible_gis_at_retry.close()
Example #16
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if ',' not in opts.ylim:
        raise RuntimeError('ylim must be comma separated')

    ylim = map(float, opts.ylim.strip().split(','))

    print 'Loading counts data'
    data_collection1 = RegionCollection(filename=opts.collection1)
    window_size = data_collection1.info['args']['window_size']
    data_collection2 = RegionCollection(filename=opts.collection2)

    # filter both
    if opts.cutoff < 0 or opts.cutoff > 1:
        raise RuntimeError('The cutoff must be between 0 and 1')

    data_collection1 = data_collection1.filteredChebyshevUpper(opts.cutoff)
    data_collection2 = data_collection2.filteredChebyshevUpper(opts.cutoff)

    # make sure each collection consists of the same genes
    shared_labels = set(data_collection1.labels) & \
                    set(data_collection2.labels)

    data_collection1 = data_collection1.filteredByLabel(shared_labels)
    data_collection2 = data_collection2.filteredByLabel(shared_labels)
    assert set(data_collection1.labels) == set(data_collection2.labels)

    if opts.sample_top is None:
        sample_top = data_collection1.N
    else:
        sample_top = opts.sample_top

    indices = range(sample_top)
    data_collection1 = data_collection1.take(indices)
    data_collection2 = data_collection2.take(indices)

    print 'Starting to plot'
    if opts.bgcolor == 'black':
        grid = {'color': 'w'}
        bgcolor = '0.1'
        vline_color = 'w'
    else:
        grid = {'color': 'k'}
        vline_color = 'k'
        bgcolor = '1.0'

    vline = dict(x=0,
                 linewidth=opts.vline_width,
                 linestyle=opts.vline_style,
                 color=vline_color)

    plot = PlottableSingle(height=opts.fig_height / 2.5,
                           width=opts.fig_width / 2.5,
                           bgcolor=bgcolor,
                           grid=grid,
                           ylim=ylim,
                           xlim=(-window_size, window_size),
                           xtick_space=opts.xgrid_lines,
                           ytick_space=opts.ygrid_lines,
                           xtick_interval=opts.xlabel_interval,
                           ytick_interval=opts.ylabel_interval,
                           xlabel_fontsize=opts.xfontsize,
                           ylabel_fontsize=opts.yfontsize,
                           vline=vline,
                           ioff=True)

    x = numpy.arange(-window_size, window_size)
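    # x spans the counts window in bases; 0 corresponds to the vertical line
    # configured above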

    if opts.metric == 'Mean counts':
        stat = averaged
    else:
        data_collection1 = data_collection1.asfreqs()
        data_collection2 = data_collection2.asfreqs()
        stat = summed

    plot_sample(plot, data_collection1, stat_maker(stat, data_collection1), x,
                opts.title, opts.xlabel, opts.ylabel, 'b', opts.legend1,
                opts.plot_stderr)
    plot_sample(plot, data_collection2, stat_maker(stat, data_collection2), x,
                opts.title, opts.xlabel, opts.ylabel, 'r', opts.legend2,
                opts.plot_stderr)

    plot.legend()
    plot.show()
    if opts.plot_filename and not opts.test_run:
        plot.savefig(opts.plot_filename)
    else:
        print opts.plot_filename
Example #17
def main():
    option_parser, opts, args =\
    parse_command_line_parameters(**script_info)

    rr = RunRecord()
    rr.addMessage('export_feature_counts', LOG_INFO, 'Chosen sample',
                  opts.sample)

    if opts.sample is None:
        rr.display()
        raise RuntimeError('No samples available')

    sample_name = _samples_name(opts.sample)
    rr.addMessage('export_feature_counts', LOG_INFO, 'Chosen sample',
                  opts.sample)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'Immuno precipitated counts path', opts.IP_counts_path)
    rr.addMessage('export_feature_counts', LOG_INFO, 'Input counts path',
                  opts.IN_counts_path)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'No. of most expressed genes sampled', opts.sample_top)
    rr.addMessage('export_feature_counts', LOG_INFO, 'maximum_read_length',
                  opts.maximum_read_length)
    rr.addMessage('export_feature_counts', LOG_INFO, 'count_max_length',
                  opts.count_max_length)
    rr.addMessage('export_feature_counts', LOG_INFO, 'upstream_size',
                  opts.upstream_size)
    rr.addMessage('export_feature_counts', LOG_INFO, 'pseudo_count',
                  opts.pseudo_count)

    genes = db_query.get_ranked_expression(session,
                                           sample_name,
                                           biotype='protein_coding',
                                           rank_by='mean',
                                           test_run=opts.test_run)
    genes = genes[:opts.sample_top]
    chrom_gene_groups = grouped_by_chrom(genes)

    ip_table = get_sum_counts_table(session,
                                    chrom_gene_groups,
                                    opts.IP_counts_path,
                                    opts.maximum_read_length,
                                    opts.count_max_length,
                                    opts.upstream_size,
                                    test_run=opts.test_run)
    # IP stands for Immuno precipitated
    ip_table.Title = 'IP'
    # renamed the IP table counts header for consistency with result of
    # joining the IP and IN tables
    ip_table = ip_table.withNewHeader(['counts'], ['IP_counts'])

    in_table = get_sum_counts_table(session,
                                    chrom_gene_groups,
                                    opts.IN_counts_path,
                                    opts.maximum_read_length,
                                    opts.count_max_length,
                                    opts.upstream_size,
                                    test_run=opts.test_run)

    # IN stands for Input
    in_table.Title = 'IN'

    combined = ip_table.joined(in_table,
                               columns_self=('region_type', 'ensembl_id',
                                             'region_rank'))
    combined.Title = ''

    ratio = CalcRatio(opts.pseudo_count)
    combined = combined.withNewColumn('ratio',
                                      ratio,
                                      columns=['IP_counts', 'IN_counts'])

    if not opts.test_run:
        combined.writeToFile(opts.save_table_name, sep='\t')
        rr.addMessage('export_feature_counts', LOG_INFO, 'Wrote counts to',
                      opts.save_table_name)
    else:
        print combined[:10]

    rr.display()