Ejemplo n.º 1
0
def get_sequence_from_location(species, coords):
    """Return the sequence of a genomic region of an Ensembl species genome.

    ``coords`` is unpacked into chromosome name, start, end and strand. The
    region is requested from release '87' with no explicit account, and the
    ``Seq`` attribute of the returned region is handed back.
    """

    from cogent.db.ensembl import HostAccount, Genome, Compara, Species

    ensembl_genome = Genome(Species=species, Release='87', account=None)
    chrom, start, end, strand = coords
    # The chromosome name is always passed on as a string.
    region_query = dict(CoordName=str(chrom), Start=start, End=end,
                        Strand=strand)
    region = ensembl_genome.getRegion(**region_query)
    return region.Seq
Ejemplo n.º 2
0
def get_genes_from_location(ref, coords, pad=0):
    """Return the gene features found in a window of genome coordinates.

    ``coords`` is unpacked as (chrom, start, end, strand); the strand is not
    used. The query window is widened by ``pad`` bases on each side.
    ``release`` and ``account`` are read from the enclosing module scope.
    """

    ref_genome = Genome(Species=ref, Release=release, account=account)
    chrom, start, end, _strand = coords
    lower = start - pad
    upper = end + pad
    features = ref_genome.getFeatures(CoordName=chrom, Start=lower, End=upper,
                                      feature_types='gene')
    return list(features)
Ejemplo n.º 3
0
def main(job_no, coord_name, start, end, species, release, dir):
    """Extract the intergenic sequence of a genomic interval and pickle it.

    Gene features on ``coord_name`` between ``start`` and ``end`` are fetched
    from the ``species`` genome, the complement of their (start, end)
    intervals is obtained from ``interval_complement``, and the sequence of
    every complementary interval is concatenated, each one preceded by a run
    of ten 'X' characters. The resulting string is written as a gzipped
    pickle into ``dir`` and the run is recorded through ``LOGGER``.

    Parameters:
        job_no: string label concatenated into the log and output file names.
        coord_name, start, end: region handed to ``genome.get_features``.
        species: species name given to ``Genome``; also part of the output
            file name.
        release: Ensembl release given to ``Genome``.
        dir: output directory, created when it does not exist.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    gene_intervals = []
    for gene in genes:
        # Stop at the first feature reported on a different coordinate system.
        if gene.location.coord_name != coord_name:
            break
        gene_intervals.append((gene.location.start, gene.location.end))
    # Intervals are ordered by their end coordinate before complementing.
    gene_intervals.sort(key=lambda interval: interval[1])
    intergenic = interval_complement(gene_intervals)

    ig_count, sequence_length = 0, 0
    # Collect the pieces and join once instead of growing a string in the
    # loop, which copies the whole accumulated sequence on every iteration.
    pieces = []
    for ig_interval in intergenic:
        ig_count += 1
        sequence_length += ig_interval[1] - ig_interval[0]
        region = genome.get_region(coord_name=coord_name,
                                   start=ig_interval[0],
                                   end=ig_interval[1])
        pieces.append('XXXXXXXXXX' + str(region.seq))
    intergenic_sequence = ''.join(pieces)

    LOGGER.log_message(
        str(ig_count),
        label='Number of integenic intervals processed'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))

    outfile_name = dir + '/intergenic_sequence_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intergenic_sequence, outfile)
    # The logger only needs the path; reopening the file just to read its
    # ``name`` left a handle open whenever the logger call raised.
    LOGGER.output_file(outfile_name)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
def main(job_no, coord_name, start, end, species, release, folder):
    """Extract the intron sequences of a genomic interval and pickle them.

    For every gene feature in the requested region, the introns of its
    canonical transcript are visited. Introns that compare equal to one
    already seen are counted as duplicates and skipped. The output string
    starts with a single 'X' and each retained intron sequence is appended
    after a run of ten 'X' characters. The string is written as a gzipped
    pickle into ``folder`` and the run is recorded through ``LOGGER``.

    Parameters:
        job_no: string label concatenated into the log and output file names.
        coord_name, start, end: region handed to ``genome.get_features``.
        species: species name given to ``Genome``; also part of the output
            file name.
        release: Ensembl release given to ``Genome``.
        folder: output directory, created when it does not exist.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(folder):
        os.makedirs(folder)
    LOGGER.log_file_path = folder + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    LOGGER.log_message('Name = ' + ensembldb3.__name__ + ', version = ' +
                       ensembldb3.__version__,
                       label="Imported module".ljust(30))
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    dupl_introns, intron_count, sequence_length = 0, 0, 0
    # Kept as a list (not a set): nothing here shows introns are hashable.
    intron_list = []
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    # Collect the pieces and join once instead of growing a string in the
    # loop; the leading 'X' matches the original initial value.
    pieces = ['X']
    for gene in genes:
        introns = gene.canonical_transcript.introns
        if introns is None:
            continue
        for intron in introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_count += 1
            sequence_length += len(intron)
            pieces.append('XXXXXXXXXX' + str(intron.seq))
    intron_sequence = ''.join(pieces)

    outfile_name = folder + '/intronic_sequence' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(intron_sequence, outfile)
    # The logger only needs the path; no need to reopen the file for it.
    LOGGER.output_file(outfile_name)
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    LOGGER.log_message(str(sequence_length),
                       label='Total intron_length'.ljust(30))

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Ejemplo n.º 5
0
def get_genes_from_location(ref, coords, pad=0):
    """List the gene features inside a padded genomic window.

    ``coords`` unpacks to (chrom, start, end, strand); the strand is ignored.
    ``pad`` extends the window by that many bases at both ends. The Ensembl
    ``release`` and ``account`` come from module level names.
    """

    genome = Genome(Species=ref, Release=release, account=account)
    chrom, start, end, strand = coords
    window = {'CoordName': chrom, 'Start': start - pad, 'End': end + pad}
    return [gene
            for gene in genome.getFeatures(feature_types='gene', **window)]
Ejemplo n.º 6
0
def get_sequence_from_location(species, coords):
    """Look up a genomic interval in Ensembl and hand back its sequence.

    ``coords`` is a 4-item sequence: chromosome name, start, end, strand.
    Release '87' is queried without an explicit account.
    """

    from cogent.db.ensembl import HostAccount, Genome, Compara, Species

    genome = Genome(Species=species, Release='87', account=None)
    chrom_name, first, last, orientation = coords
    return genome.getRegion(
        CoordName=str(chrom_name), Start=first, End=last, Strand=orientation
    ).Seq
Ejemplo n.º 7
0
def main(input_directory, output, flank_size):
    """Write one tab-separated variant file per gene found in a directory.

    Every path returned by ``get_files`` for ``input_directory`` names a
    gene: the stable id is the file's base name up to the first '.'. The
    gene is looked up in the mouse genome (release 88), its variants are
    obtained from ``get_var_info`` and each one is written to
    ``<output>/<gene_id>.txt`` as the tab-joined fields from ``var_records``.
    When ``get_var_info`` or the record loop raises ``AssertionError``, the
    partial output file is deleted.

    Parameters:
        input_directory: directory scanned for per-gene input files.
        output: output directory, created when it does not exist; also
            receives the log file ``mouse_germline.log``.
        flank_size: passed through to ``get_var_info``.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    args = locals()

    if not os.path.exists(output):
        os.makedirs(output)

    logfile_path = os.path.join(output, "mouse_germline.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.log_message(str(args), label="vars")

    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    mouse = Genome('mouse', release=88, account=account, pool_recycle=10000)

    input_path = os.path.abspath(input_directory)
    file_paths = get_files(input_path)

    # Start the clock once, before the loop. It used to be reset for every
    # gene, so the logged duration covered only the last gene and the final
    # computation raised NameError when there were no input files.
    start_time = time.time()
    for fn in file_paths:
        LOGGER.input_file(fn)

        gene_id = os.path.basename(fn).split('.')[0]

        print("Acquiring variants from gene %s" % gene_id)
        gene = mouse.get_gene_by_stableid(stableid=gene_id)

        output_file = os.path.join(output, '%s.txt' % gene_id)

        num = 0
        failed = False
        with open(output_file, mode='w') as out_file:
            try:
                variants = get_var_info(gene, gene_id, flank_size)

                for var in variants:
                    record = var_records(str(var))
                    out_file.write('\t'.join(record) + '\n')
                    num += 1

            except AssertionError:
                print('for gene %s, the translated exon and CDs are different.' % gene_id)
                failed = True

        if failed:
            # Remove the partial file only after it has been closed.
            os.remove(output_file)
        else:
            # Register the output once it is complete, so that whatever the
            # logger records about the file reflects its final content.
            LOGGER.output_file(output_file)

        print("finish getting variants on gene %s" % gene_id)
        LOGGER.log_message("%s" % num, label="Number of SNPs recorded")
        print()

    print()
    print('Done')

    # determine runtime
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)")
Ejemplo n.º 8
0
def main(input_dir, output_datafile, flank_size, chroms, coord_range):
    """Sample ENU variant records from '|' delimited files into one data file.

    Every file returned by ``get_files`` for ``input_dir`` is read line by
    line after its first line is skipped. Each line is split on '|' and
    handed to ``get_ENU_data``; lines for which it returns a falsy value are
    skipped. The six values it returns are passed to ``get_snp_data`` together
    with the mouse genome (release 88) and ``int(flank_size)``; truthy
    results are written to ``output_datafile`` as tab-joined lines.

    Parameters:
        input_dir: directory scanned for input files.
        output_datafile: path of the tab-separated output file; its parent
            directory is created when needed and receives
            ``logs/sample_ENU.log``.
        flank_size: converted with ``int`` and passed to ``get_snp_data``.
        chroms, coord_range: passed through to ``get_ENU_data``.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    args = locals()

    output_dir = os.path.dirname(output_datafile)
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logfile_path = os.path.join(output_dir, "logs/sample_ENU.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.log_message(str(args), label="vars")

    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    mouse = Genome('mouse', release=88, account=account, pool_recycle=10000)

    input_dir = os.path.abspath(input_dir)
    file_paths = get_files(input_dir)

    start_time = time.time()
    for file_index, fn in enumerate(file_paths):
        # Truncate the output for the first input only and append afterwards.
        # Opening with 'w' for every input discarded the records of all but
        # the last file.
        write_mode = 'w' if file_index == 0 else 'a'
        with open(fn, mode='r') as input_file:
            LOGGER.input_file(fn)
            with open(output_datafile, mode=write_mode) as output:
                LOGGER.output_file(output_datafile)
                # The first line of each input is consumed and not processed.
                input_file.readline()

                num = 0
                for line in input_file:
                    records = line.split('|')
                    print('Variant %s' % records[0])
                    # Query once and reuse the result for both the filter
                    # and the unpacking.
                    enu_data = get_ENU_data(records, chroms, coord_range)
                    if not enu_data:
                        continue

                    (var_id, chromosome, coordinate, ref_base, var_base,
                     effect) = enu_data

                    record = get_snp_data(var_id, chromosome, coordinate,
                                          ref_base, var_base, effect, mouse,
                                          int(flank_size))

                    if not record:
                        continue

                    output.write('\t'.join(record) + '\n')
                    num += 1

        # ``output`` is already closed by its ``with`` block at this point.
        print("num written", num)

    # determine runtime
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)")
Ejemplo n.º 9
0
def dump_genes(ensembl_account, species, outpath, coord_names, release, limit):
    """Dump meta data table for genes from one species in release ENSEMBL_ACCOUNT
    and exits."""
    # NOTE(review): docstring kept verbatim; it may be surfaced as command
    # line help text by a decorator that is not visible here.
    ensembl_account = _get_account(ensembl_account)

    # Guard: exactly one species is supported.
    if len(species) > 1:
        click.secho("dump_genes handles single species only", fg="red")
        sys.exit(-1)

    # Guard: every requested name must match an Ensembl record.
    unknown_names = missing_species_names(species)
    if unknown_names:
        report = "\n".join(
            [
                "The following species names don't match an Ensembl record. Check spelling!",
                str(unknown_names),
                "\nAvailable species are at this server are:",
                str(display_available_dbs(ensembl_account)),
            ]
        )
        click.secho(report, fg="red")
        sys.exit(-1)

    # Optional restriction to the coordinate names listed in a file.
    chroms = load_coord_names(coord_names) if coord_names else None

    genome = Genome(species[0], release=release, account=ensembl_account)
    records = [
        [gene.stableid, gene.biotype, gene.location, gene.description]
        for gene in _get_ref_genes(genome, chroms, limit)
    ]

    if not records:
        click.secho("No genes matching criteria", fg="blue")
        return

    column_names = ["stableid", "biotype", "location", "description"]
    table = make_table(header=column_names, rows=records)
    table.write(outpath)
    click.secho("Wrote %d genes to %s" % (table.shape[0], outpath), fg="green")
def main(job_no, coord_name, start, end, species, release, var_set_id, dir):
    """Collect variant details for a genomic interval and pickle them.

    After ``confirm_variation_set`` has been called for ``var_set_id``, the
    result of ``get_variant_details`` for the region is written as a gzipped
    pickle into ``dir`` and the run is recorded through ``LOGGER``.

    Parameters:
        job_no: string label concatenated into the log and output file names.
        coord_name, start, end: region handed to ``get_variant_details``.
        species: species name given to ``Genome``; also part of the output
            file name.
        release: Ensembl release given to ``Genome``.
        var_set_id: variation set identifier checked by
            ``confirm_variation_set``.
        dir: output directory, created when it does not exist.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    # Record the versions of the database access libraries in use.
    for module in (ensembldb3, sqlalchemy):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' +
                           module.__version__,
                           label="Imported module".ljust(30))
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    var_locations = get_variant_details(genome, coord_name, start, end)
    LOGGER.log_message(str(len(var_locations)),
                       label='Length of var_locations list'.ljust(30))

    outfile_name = dir + '/intergenic_variants_' + species + '_' + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations, outfile)
    # The logger only needs the path; reopening the file just to read its
    # ``name`` left a handle open whenever the logger call raised.
    LOGGER.output_file(outfile_name)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Ejemplo n.º 11
0
def main_core(job_no, species, varfile_name=None, intronfile_name=None, release=89, n_jobs=5, dir='data'):
    """Count the contexts of pickled variants and pickle the resulting counts.

    Variant details and intron locations are loaded from gzipped pickles,
    passed through ``check_variant_strand``, and ``get_contexts`` is then run
    for every variant via ``Parallel``/``delayed``. The returned items are
    tallied in a ``Counter`` (dropping the ``None`` key) and the counter is
    written as a gzipped pickle into ``dir``.

    Parameters:
        job_no: string label concatenated into log, input and output names.
        species: 'human' or 'chimp'; selects the seq_region id table and the
            file name tag ('human' or 'spec_').
        varfile_name: variant pickle; defaults to
            ``<dir>/var_locations_<tag><job_no>.pklz``.
        intronfile_name: intron location pickle; defaults to
            ``<dir>/all_locations_<tag><job_no>.pklz``.
        release: Ensembl release given to ``Genome``.
        n_jobs: number of jobs given to ``Parallel``.
        dir: working directory, created when it does not exist.

    Raises:
        AssertionError: if ``species`` is neither 'human' nor 'chimp'.
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    # ``genome`` is published as a module global; presumably it is read by
    # get_contexts -- TODO confirm.
    global genome
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    start_time = time()
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(25))
    for module in (numpy, cogent3, ensembldb3):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' + module.__version__,
                           label="Imported module".ljust(25))
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species, release=release, account=account, pool_recycle=3600)
    # Hard-coded coordinate name -> seq_region id tables.
    human_seq_region_dict = {
        '1': 131550, '2': 131545, '3': 131551, '4': 131552, '5': 131542, '6': 131555, '7': 131559,
        '8': 131560, '9': 131540, '10': 131544, '11': 131556, '12': 131546, '13': 131541,
        '14': 131547, '15': 131558,
        '16': 131549, '17': 131554, '18': 131548, '19': 131537, '20': 131538, '21': 131543,
        '22': 131557,
        'X': 131539, 'Y': 131553,
    }
    chimp_seq_region_dict = {
        "21": 212405, "7": 212407, "15": 212409, "16": 212395, "1": 212403, "17": 212411,
        "18": 212410, "19": 212394, "20": 212404, "22": 212390, "3": 212392, "4": 212393,
        "5": 212391, "6": 212388, "8": 212397, "9": 212396, "10": 212387, "11": 212389,
        "12": 212402, "13": 212408, "14": 212401, "Y": 212406, "X": 212399,
    }
    if species == 'human':
        seq_region_dict, tag = human_seq_region_dict, 'human'
    elif species == 'chimp':
        seq_region_dict, tag = chimp_seq_region_dict, 'spec_'
    else:
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError('Unknown species: ' + species)
    # Invert the table: seq_region id -> coordinate name.
    coord_dict = {region_id: name for name, region_id in seq_region_dict.items()}

    if varfile_name is None:
        varfile_name = dir + '/var_locations_' + tag + job_no + '.pklz'
    LOGGER.input_file(varfile_name)
    # NOTE: pickle.load executes arbitrary code from the file; only use
    # pickles produced by a trusted run of this pipeline.
    with gzip.open(varfile_name, 'rb') as var_file:
        var_details = pickle.load(var_file)
    LOGGER.log_message(str(len(var_details)), label="Number of variants read".ljust(25))

    if intronfile_name is None:
        intronfile_name = dir + '/all_locations_' + tag + job_no + '.pklz'
    LOGGER.input_file(intronfile_name)
    # Loaded once; the file used to be unpickled a second time right after.
    with gzip.open(intronfile_name, 'rb') as intron_file:
        intron_locs = pickle.load(intron_file)
    LOGGER.log_message(str(len(intron_locs)), label="Number of introns read".ljust(25))
    var_details, _ = check_variant_strand(var_details, intron_locs)

    # var_details fields are: (variant name, seq region id, location, ancestral_allele, derived_allele)
    item_list = Parallel(n_jobs=n_jobs)(delayed(get_contexts)(var, coord_dict) for var in var_details)
    var_count_dict = Counter(item_list)
    # Counter deletion does not raise when the key is absent.
    del var_count_dict[None]
    outfile_name = dir + '/var_dict_' + tag + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_count_dict, outfile)
    LOGGER.output_file(outfile_name)
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="run duration (minutes)".ljust(25))
Ejemplo n.º 12
0
def main(job_no, coord_name, start, end, species, release, var_set_id, filter,
         dir):
    """Collect intron locations and their variants for a genomic interval.

    For every gene feature in the requested region, the introns of its
    canonical transcript are visited. Introns that compare equal to one
    already seen are counted as duplicates and skipped. For each retained
    intron its (coord_name, start, end, strand) is recorded and
    ``get_variant_details`` supplies the variant records plus a count of
    rejected variants. Two gzipped pickles are written into ``dir``: the
    variant list (``var_locations_...``) and the intron location list
    (``all_locations_...``).

    Parameters:
        job_no: string label concatenated into the log and output file names.
        coord_name, start, end: region handed to ``genome.get_features``.
        species: species name given to ``Genome``; also part of the output
            file names.
        release: Ensembl release given to ``Genome``.
        var_set_id: variation set identifier checked by
            ``confirm_variation_set``.
        filter: passed through to ``get_variant_details``.
        dir: output directory, created when it does not exist.

    Raises:
        KeyError: if the ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    for module in (ensembldb3, sqlalchemy):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' +
                           module.__version__,
                           label="Imported module".ljust(30))
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())

    var_locations_list, location_list = [], []
    dupl_introns, intron_count, bad_var_count, sequence_length = 0, 0, 0, 0
    # Kept as a list (not a set): nothing here shows introns are hashable.
    intron_list = []
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    confirm_variation_set(genome, var_set_id)
    genes = genome.get_features(coord_name=coord_name,
                                start=start,
                                end=end,
                                feature_types='gene')
    for gene in genes:
        introns = gene.canonical_transcript.introns
        if introns is None:
            continue
        for intron in introns:
            if intron in intron_list:
                dupl_introns += 1
                continue
            intron_list.append(intron)
            intron_count += 1
            sequence_length += len(intron)
            loc = intron.location
            location_list.append(
                (str(loc.coord_name), loc.start, loc.end,
                 loc.strand))  # location.coord_name is db3util object
            var_locations, bad_var_num = get_variant_details(
                genome, species, intron, filter)
            # Extend in place rather than rebuilding the list each time.
            var_locations_list.extend(var_locations)
            bad_var_count += bad_var_num
    LOGGER.log_message(str(dupl_introns),
                       label='Number of duplicate introns rejected'.ljust(30))
    LOGGER.log_message(str(intron_count),
                       label='Number of introns processed'.ljust(30))
    if species == 'human':
        LOGGER.log_message(str(bad_var_count),
                           label='Number of rejected variants'.ljust(30))
    LOGGER.log_message(str(sequence_length), label='Sequence length'.ljust(30))
    LOGGER.log_message(str(len(var_locations_list)),
                       label='Length of var_locations list'.ljust(30))
    # A region without intron sequence used to abort here with
    # ZeroDivisionError before any output was written; log NaN instead.
    if sequence_length:
        snv_rate = len(var_locations_list) / sequence_length
    else:
        snv_rate = float('nan')
    LOGGER.log_message(str(snv_rate), label='Average SNV rate'.ljust(30))

    outfile_name = dir + '/var_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(var_locations_list, outfile)
    # The logger only needs the path; no need to reopen the file for it.
    LOGGER.output_file(outfile_name)

    outfile_name = dir + '/all_locations_' + species + job_no + '.pklz'
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(location_list, outfile)
    LOGGER.output_file(outfile_name)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Ejemplo n.º 13
0
def one2one(
    ensembl_account,
    species,
    release,
    outdir,
    ref,
    ref_genes_file,
    coord_names,
    not_strict,
    introns,
    method_clade_id,
    mask_features,
    logfile_name,
    limit,
    force_overwrite,
    test,
):
    """Command line tool for sampling homologous sequences from Ensembl."""
    # NOTE(review): docstring left untouched; it may be surfaced as command
    # line help text by a decorator that is not visible here.
    #
    # Flow: validate the options, build a Compara instance for ``species``,
    # obtain the reference gene stable ids (either from the ``ref`` genome or
    # from ``ref_genes_file``), then fetch one-to-one orthologs, or syntenic
    # intron alignments when ``introns`` is set. Every validation failure
    # prints a message and terminates via exit(-1).
    outdir = abspath(outdir)
    if not any([ref, ref_genes_file]):
        # just the command name, indicate they need to display help
        click.secho("Missing 'ref' and 'ref_genes_file'")
        ctx = click.get_current_context()
        msg = "%s\n\n--help to see all options\n" % ctx.get_usage()
        click.echo(msg)
        exit(-1)

    ensembl_account = _get_account(ensembl_account)
    # Snapshot of the local variables at this point (the parameters) for the
    # log; the account object is replaced by its string form.
    args = locals()
    args["ensembl_account"] = str(ensembl_account)
    LOGGER.log_message(str(args), label="params")

    # In test mode with no explicit limit only 2 genes are processed;
    # otherwise a limit of 0 is turned into None (no limit).
    if test and limit == 0:
        limit = 2
    else:
        limit = limit or None

    # introns requires method_clade_id, and mask_features requires introns.
    if (introns and not method_clade_id) or (mask_features and not introns):
        msg = [
            "Must specify the introns and method_clade_id in order to",
            "export introns. Use show_align_methods to see the options",
        ]
        click.secho("\n".join(msg), fg="red")
        exit(-1)

    species_missing = missing_species_names(species)
    if species_missing:
        msg = [
            "The following species names don't match an Ensembl record."
            " Check spelling!",
            str(species_missing),
            "\nAvailable species are at this server are:",
            str(display_available_dbs(ensembl_account)),
        ]

        click.secho("\n".join(msg), fg="red")
        exit(-1)

    if ref:
        ref = ref.lower()

    # The (lower-cased) reference must be one of the requested species.
    if ref and ref not in species:
        print("The reference species not in species names")
        exit(-1)

    compara = Compara(species, release=release, account=ensembl_account)
    runlog_path = os.path.join(outdir, logfile_name)

    # Refuse to clobber an existing run log unless explicitly allowed.
    if os.path.exists(runlog_path) and not force_overwrite:
        msg = [
            "Log file (%s) already exists!" % runlog_path,
            "Use force_overwrite or provide logfile_name",
        ]
        click.secho("\n".join(msg), fg="red")
        exit(-1)

    # The log file path is only assigned outside test mode.
    if not test:
        LOGGER.log_file_path = runlog_path

    chroms = None
    if coord_names:
        chroms = load_coord_names(coord_names)
        LOGGER.input_file(coord_names)
    # NOTE(review): this branch is unreachable -- it also requires
    # ``coord_names``, which the ``if`` above has already handled, so
    # get_chrom_names is never called. Possibly ``elif ref:`` was intended;
    # confirm before changing, since that would alter which genes are sampled.
    elif coord_names and ref:
        chroms = get_chrom_names(ref, compara)

    if not os.path.exists(outdir) and not test:
        os.makedirs(outdir)
        print("Created", outdir)

    # Reference genes come from the reference genome unless a file of
    # stable ids was supplied.
    if ref and not ref_genes_file:
        ref_genome = Genome(ref, release=release, account=ensembl_account)
        ref_genes = [g.stableid for g in _get_ref_genes(ref_genome, chroms, limit)]
    else:
        if not (ref_genes_file.endswith(".csv") or ref_genes_file.endswith(".tsv")):
            msg = (
                "ref_genes_file must be either a comma/tab "
                "delimted with the corresponding suffix (.csv/.tsv)"
            )
            click.secho(msg, fg="red")
            exit(-1)

        ref_genes = load_table(ref_genes_file)
        if "stableid" not in ref_genes.header:
            msg = "ref_genes_file does not have a 'stableid' column header"
            click.secho(msg, fg="red")
            exit(-1)

        ref_genes = ref_genes.tolist("stableid")

    # Applies to both sources of reference genes.
    if limit:
        ref_genes = ref_genes[:limit]

    if not introns:
        print("Getting orthologs %d genes" % len(ref_genes))
        get_one2one_orthologs(
            compara, ref_genes, outdir, not_strict, force_overwrite, test
        )
    else:
        print("Getting orthologous introns for %d genes" % len(ref_genes))
        # NOTE(review): ``outdir`` is passed twice (3rd and 6th argument);
        # verify against the signature of get_syntenic_alignments_introns.
        get_syntenic_alignments_introns(
            compara,
            ref_genes,
            outdir,
            method_clade_id,
            mask_features,
            outdir,
            force_overwrite,
            test,
        )
def main(job_no, chrom, sex, species, release, dir):
    """Tabulate mutation and base counts around recombination positions.

    Rows of ``<sex>_noncarrier-hg38.csv`` whose ``chr`` column equals
    'chr' + ``chrom`` are processed one by one. For each row the genome
    region from ``pos38`` - 5000 to ``pos38`` + 5000 is fetched, variants in
    that window are selected from the joined variation / variation_feature
    tables, and counts are accumulated per row for: the 12 ancestral->derived
    profiles, the CpG, SW, WS, SS and WW classes, and the number of each base
    in the region. The input rows and the 21 count columns are written side
    by side to ``recomb_table_SW_<sex>_ch<chrom>.csv`` in the current
    working directory.

    Parameters:
        job_no: string label concatenated into the log file name.
        chrom: chromosome name; must be a key of the hard-coded seq_region
            id table ('1'..'22', 'X', 'Y').
        sex: prefix of the input CSV name and part of the output name.
        species: species name given to ``Genome``.
        release: Ensembl release given to ``Genome``.
        dir: directory for the log file, created when it does not exist.

    Raises:
        KeyError: if ``chrom`` is not in the seq_region id table or the
            ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    for module in (ensembldb3, sqlalchemy):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' +
                           module.__version__,
                           label="Imported module".ljust(30))
    # Hard-coded coordinate name -> seq_region id table.
    human_seq_region_dict = {
        '1': 131550,
        '2': 131545,
        '3': 131551,
        '4': 131552,
        '5': 131542,
        '6': 131555,
        '7': 131559,
        '8': 131560,
        '9': 131540,
        '10': 131544,
        '11': 131556,
        '12': 131546,
        '13': 131541,
        '14': 131547,
        '15': 131558,
        '16': 131549,
        '17': 131554,
        '18': 131548,
        '19': 131537,
        '20': 131538,
        '21': 131543,
        '22': 131557,
        'X': 131539,
        'Y': 131553,
    }
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome(species,
                    release=release,
                    account=account,
                    pool_recycle=3600)
    variation_table = genome.VarDb.get_table('variation')
    variation_feature_table = genome.VarDb.get_table('variation_feature')
    var_table = variation_table.join(
        variation_feature_table, variation_feature_table.c.variation_id ==
        variation_table.c.variation_id)

    seq_region_id = human_seq_region_dict[chrom]

    file_name = sex + '_noncarrier-hg38.csv'
    # The logger only needs the path; no need to open the file for it.
    LOGGER.input_file(file_name)
    recombination_df = pd.read_csv(file_name, usecols=[0, 1, 2, 3, 4])
    recomb_df = recombination_df.loc[lambda df: df.chr == 'chr' + chrom, :]
    # Renumber rows from 0 so they line up with ``counts`` below.
    recomb_df = recomb_df.reset_index(drop=True)

    # 12 ordered base pairs, written as 'X->Y'.
    mut_profiles = [
        i[0] + '->' + i[1] for i in permutations(['C', 'T', 'A', 'G'], 2)
    ]
    counts = np.zeros((recomb_df.shape[0], 21))
    counts = pd.DataFrame(counts,
                          columns=mut_profiles +
                          ['C', 'T', 'A', 'G', 'SW', 'WS', 'SS', 'WW', 'CpG'])
    for index, row in recomb_df.iterrows():
        midpoint = row.loc['pos38']
        region = genome.get_region(coord_name=chrom,
                                   start=midpoint - 5000,
                                   end=midpoint + 5000,
                                   ensembl_coord=True)
        region = str(region.seq)
        whereclause1 = and_(
            var_table.c.variation_feature_seq_region_id == seq_region_id,
            var_table.c.variation_feature_class_attrib_id == 2,
            var_table.c.variation_feature_evidence_attribs.contains('370'),
            var_table.c.variation_feature_variation_name.contains('rs'),
            var_table.c.variation_feature_somatic == 0,
            var_table.c.variation_feature_alignment_quality ==
            decimal.Decimal(1),
            var_table.c.variation_feature_minor_allele_freq.isnot(None),
            var_table.c.variation_feature_seq_region_start > midpoint - 5000,
            var_table.c.variation_feature_seq_region_start < midpoint + 5000)
        var_table_ed = var_table.select(whereclause1, use_labels=True)

        for snp in var_table_ed.execute():
            ancestral_allele = snp['variation_ancestral_allele']
            if ancestral_allele is None:
                continue
            # The allele string must be 'anc/x' or 'x/anc' with a single
            # character x; anything else is skipped.
            alleles = snp['variation_feature_allele_string']
            if fnmatch(alleles, ancestral_allele + '/?'):
                derived_allele = alleles[2]
            elif fnmatch(alleles, '?/' + ancestral_allele):
                derived_allele = alleles[0]
            else:
                continue
            mtype = ancestral_allele + '->' + derived_allele
            counts.loc[index, mtype] += 1

            # Offset of the variant inside ``region``; the strict bounds in
            # the query keep rel_loc - 1 and rel_loc + 1 inside the string.
            rel_loc = snp[
                'variation_feature_seq_region_start'] - midpoint + 5000
            if (region[rel_loc + 1] == 'G' and ancestral_allele == 'C' and derived_allele == 'T') or \
                    (region[rel_loc - 1] == 'C' and ancestral_allele == 'G' and derived_allele == 'A'):
                counts.loc[index, 'CpG'] += 1
            # The four classes are mutually exclusive.
            pair = ancestral_allele + derived_allele
            if pair in ['CT', 'CA', 'GT', 'GA']:
                counts.loc[index, 'SW'] += 1
            elif pair in ['TC', 'AC', 'TG', 'AG']:
                counts.loc[index, 'WS'] += 1
            elif pair in ['CG', 'GC']:
                counts.loc[index, 'SS'] += 1
            elif pair in ['TA', 'AT']:
                counts.loc[index, 'WW'] += 1
        base_counts = Counter(region)
        for base in ['C', 'T', 'A', 'G']:
            counts.loc[index, base] = base_counts[base]

    results = pd.concat([recomb_df, counts], axis=1)
    csv_filename = 'recomb_table_SW_' + sex + '_ch' + chrom + '.csv'
    results.to_csv(csv_filename)
    # The logger only needs the path; reopening the file just to read its
    # ``name`` left a handle open whenever the logger call raised.
    LOGGER.output_file(csv_filename)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
Ejemplo n.º 15
0
def main(job_no, infile_name, release, dir):
    """Annotate variants from a CSV with matching Ensembl variation records.

    Every row of ``infile_name`` (columns ``chr`` such as 'chr1', and
    ``pos38``) is looked up in the human ``variation_feature`` table by
    seq_region id and start position. Three columns are added to the frame:
    ``alleles`` and ``name`` from the first matching record (``None`` when
    there is none) and ``val_id``, which is '1KG' when the record's
    variation_set_id shares a member with the ids '42'..'54', 'Other' for
    any other match and 'No match' otherwise. The result is written to
    ``data/dnms_from_PRJEB21300_matched_<job_no>.csv``.

    Parameters:
        job_no: string label concatenated into the log and output file names.
        infile_name: path of the input CSV; its first column is the index.
        release: Ensembl release given to ``Genome``.
        dir: directory for the log file, created when it does not exist.

    Raises:
        KeyError: if a chromosome is not in the seq_region id table or the
            ENSEMBL_ACCOUNT environment variable is not set.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    # The conda environment is logged only when one is active.
    if 'CONDA_DEFAULT_ENV' in os.environ:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    for module in (pd, ensembldb3, sqlalchemy):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' +
                           module.__version__,
                           label="Imported module".ljust(30))
    # Hard-coded coordinate name -> seq_region id table.
    human_seq_region_dict = {
        '1': 131550,
        '2': 131545,
        '3': 131551,
        '4': 131552,
        '5': 131542,
        '6': 131555,
        '7': 131559,
        '8': 131560,
        '9': 131540,
        '10': 131544,
        '11': 131556,
        '12': 131546,
        '13': 131541,
        '14': 131547,
        '15': 131558,
        '16': 131549,
        '17': 131554,
        '18': 131548,
        '19': 131537,
        '20': 131538,
        '21': 131543,
        '22': 131557,
        'X': 131539,
        'Y': 131553,
    }
    # ENSEMBL_ACCOUNT holds whitespace separated HostAccount arguments.
    account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
    genome = Genome('human',
                    release=release,
                    account=account,
                    pool_recycle=3600)

    variation_feature_table = genome.VarDb.get_table('variation_feature')
    id_1KG = {str(x) for x in range(42, 55)}
    var_details = pd.read_csv(infile_name, sep=',', index_col=0)
    # The logger only needs the path; no need to open the file for it.
    LOGGER.input_file(infile_name)
    # derived_mismatch_count is never incremented below; it is logged as 0.
    loc_count, match_count, count1KG, derived_mismatch_count = 0, 0, 0, 0
    col_alleles, col_name, col_val_id = [], [], []
    for _, row in var_details.iterrows():
        # Strip the leading 'chr' to get the coordinate name.
        chrom = row.loc['chr'][3:]
        seq_region_id = human_seq_region_dict[chrom]
        loc38 = row.loc['pos38']
        loc_count += 1
        whereclause1 = and_(
            variation_feature_table.c.seq_region_id == seq_region_id,
            variation_feature_table.c.seq_region_start == loc38,
            variation_feature_table.c.class_attrib_id == 2,
            variation_feature_table.c.variation_name.contains("rs"),
            variation_feature_table.c.somatic == 0,
            variation_feature_table.c.alignment_quality == decimal.Decimal(1),
            variation_feature_table.c.minor_allele_freq.isnot(None))
        query = select([
            variation_feature_table.c.variation_name,
            variation_feature_table.c.allele_string,
            variation_feature_table.c.variation_set_id
        ], whereclause1)
        snps = list(query.execute())

        if snps:
            if len(snps) > 1:
                print('More than one SNP at ', chrom, ':', loc38)
            # Only the first matching record is used.
            name, alleles, set_ids = snps[0][0], snps[0][1], snps[0][2]
            match_count += 1
            if set(set_ids).intersection(id_1KG):
                val_id = '1KG'
                count1KG += 1
            else:
                val_id = 'Other'
        else:
            val_id = 'No match'
            name = None
            alleles = None
        col_alleles.append(alleles)
        col_name.append(name)
        col_val_id.append(val_id)
    assert var_details.shape[0] == len(col_val_id), 'Column mismatch.'
    # Assign the lists positionally. Wrapping them in pd.Series gave them a
    # 0..n-1 index, which pandas aligns against the frame's index (taken
    # from the first CSV column) and fills with NaN wherever labels differ.
    var_details['alleles'] = col_alleles
    var_details['name'] = col_name
    var_details['val_id'] = col_val_id
    LOGGER.log_message(str(loc_count), label='Variants read      = ')
    LOGGER.log_message(str(derived_mismatch_count),
                       label='Derived mismatches = ')
    LOGGER.log_message(str(match_count), label='Variants matched   = ')
    LOGGER.log_message(str(count1KG), label='1KG Variants       = ')
    # NOTE(review): the output goes to a hard-coded 'data/' directory rather
    # than ``dir`` -- confirm this is intended.
    filename = 'data/dnms_from_PRJEB21300_matched_' + job_no + '.csv'
    var_details.to_csv(filename)
    # The logger only needs the path; no need to reopen the file for it.
    LOGGER.output_file(filename)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))