Example #1
def by_file(inFile, uargs):
    try:
        fasta = pyfasta.Fasta(inFile)
    except pyfasta.fasta.DuplicateHeaderException:
        tmpFile = rename_fasta(inFile)
        fasta = pyfasta.Fasta(tmpFile.name)
        tmpFile.close()
    except (ValueError, TypeError):
        msg = 'ERROR: Could not read file: {}'
        sys.stderr.write(msg.format(inFile) + '\n')
        return None
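The rename_fasta helper used above is not shown. A minimal sketch of what it might do, assuming it writes a temporary copy of the FASTA with duplicate headers made unique (the helper name and behaviour are inferred, not part of pyfasta):

import tempfile

def rename_fasta(inFile):
    # Hypothetical helper: copy the FASTA to a temp file, suffixing
    # repeated headers so pyfasta.Fasta can index the copy.
    seen = {}
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.fa', delete=False)
    with open(inFile) as fh:
        for line in fh:
            if line.startswith('>'):
                header = line[1:].strip()
                seen[header] = seen.get(header, 0) + 1
                if seen[header] > 1:
                    line = '>{}_{}\n'.format(header, seen[header])
            tmp.write(line)
    tmp.flush()
    return tmp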
Example #2
def yield_sequence_and_BlastHit(len_seq=500,
                                n_query=10,
                                min_len_hit=20,
                                max_len_hit=40):

    # Generate a random subject sequence
    with rand_fasta(len_seq=len_seq, n_seq=1) as subject:
        pyfasta_gen = pyfasta.Fasta(subject.fasta_path, flatten_inplace=True)
        seq_record = pyfasta_gen["seq_0"]
        sequence = Sequence(name="seq_0", seq_record=seq_record)

        seq_dict = {}
        # Generate random hits from the subject
        for i in range(n_query):
            start = ri(1, len_seq - max_len_hit)
            end = start + ri(min_len_hit, max_len_hit)
            seq_dict["query_{}:{}-{}".format(i, start,
                                             end)] = str(seq_record[start:end])
        with defined_fasta(seq_dict) as query:

            # Create Blast DB and perform a blast to generate a list of hits
            with Blastn(subject.fasta_path) as blastn:
                hit_list = blastn(
                    query.fasta_path,
                    task="blastn",
                    best_query_hit=True,
                )

            yield (sequence, hit_list)
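A hedged usage sketch: each iteration of the generator produces one (Sequence, hit_list) pair, with rand_fasta, defined_fasta, Blastn and Sequence assumed to come from the surrounding test helpers:

for sequence, hit_list in yield_sequence_and_BlastHit(n_query=5):
    print(sequence.name, len(hit_list))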
Example #3
def loadgenome_extradata_fx(fasta_handle, gff3_handle, meta):
    """
    """
    genome = pyfasta.Fasta(fasta_handle, key_fn=lambda key: key.split()[0])
    gff3 = allel.FeatureTable.from_gff3(gff3_handle)
    meta = pd.read_csv(meta, delimiter=",")
    return (genome, gff3, meta)
Example #4
 def _contig_size_list(self, path):
     """Insert all contig sizes in a list."""
     f = pyfasta.Fasta(path)
     self.contig_sizes = []
     for key in f.keys():
         self.contig_sizes.append(len(f[key]))
     self.contig_sizes.sort()
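With a sorted list of contig sizes in hand, a common follow-up metric is N50. A minimal sketch built on the list this method produces (the function name is illustrative):

def n50(contig_sizes):
    # contig_sizes: ascending list of contig lengths, as built above.
    # N50 is the length of the shortest contig in the smallest set of
    # largest contigs that together cover at least half the assembly.
    half = sum(contig_sizes) / 2.0
    running = 0
    for size in reversed(contig_sizes):
        running += size
        if running >= half:
            return size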
Example #5
def main(argv):
    parser = argparse.ArgumentParser(
        description='Interpret FASTA file with Stanford service.')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-i',
                               '--input',
                               dest='input_file',
                               help='Input file name',
                               required=True)
    requiredNamed.add_argument('-q',
                               '--query',
                               dest='graphQL_query_file',
                               help='GraphQL query file',
                               required=True)
    #parser.parse_args(['-h'])
    results = parser.parse_args(argv)
    input_file = results.input_file
    graphQL_query_file = results.graphQL_query_file

    print(generateCSVHeader())

    f = pyfasta.Fasta(input_file)

    nrLoops = int(len(f.keys()) / 1000)

    # Per 1000 sequences, do a Stanford analysis
    for i in range(0, nrLoops):
        headers = list(f.keys())[i * 1000:(i + 1) * 1000]
        pool = mp.Pool(processes=20)
        results = [
            pool.apply_async(doStanfordAnalysis,
                             args=(
                                 header,
                                 f[header],
                                 graphQL_query_file,
                             )) for header in headers
        ]
        #tmp = open(output_file, 'a')
        for p in results:
            print(p.get())
        #    tmp.write(p.get())
        #tmp.close()
        pool.close()
        pool.join()

    #print nrLoops*1000 + ((nrLoops + len(f.keys())) % 1000)
    # Do the Stanford analysis for the last sequences available
    headers = list(f.keys())[nrLoops * 1000:]
    pool = mp.Pool(processes=20)
    results = [
        pool.apply_async(doStanfordAnalysis,
                         args=(
                             header,
                             f[header],
                             graphQL_query_file,
                         )) for header in headers
    ]
    #tmp = open(output_file, 'a')
    for p in results:
        print(p.get())
    pool.close()
    pool.join()
Example #6
def explainFL(genomeFile, outPrefix, sam):
    genome = pyfasta.Fasta(genomeFile)
    samfile = pysam.AlignmentFile(sam, "r")
    fout = open(outPrefix + 'explainFL.txt', 'w')
    fout_nop = open(outPrefix + 'explainFL_noprimary.txt', 'w')
    for read in samfile.fetch():
        #if read.mapping_quality < 20:
        #   continue
        if len(read.cigar) == 0:
            continue
        readInfo = getReadInfo(read)
        leftSeq = genome.sequence({
            'chr': readInfo[1],
            'start': readInfo[2] - 1,
            'stop': readInfo[2]
        }).upper()
        rightSeq = genome.sequence({
            'chr': readInfo[1],
            'start': readInfo[3] + 1,
            'stop': readInfo[3] + 2
        }).upper()
        readInfo.append(leftSeq)
        readInfo.append(rightSeq)
        if read.flag & 256 > 0:  # secondary alignment
            fout_nop.write('\t'.join([str(i) for i in readInfo]) + "\n")
        else:
            fout.write('\t'.join([str(i) for i in readInfo]) + "\n")
    samfile.close()
    fout.close()
    fout_nop.close()
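getReadInfo is not shown. Judging from how readInfo is indexed above (field 1 = chromosome, fields 2-3 = alignment span), a minimal sketch could be:

def getReadInfo(read):
    # Hypothetical helper: read name, reference contig, and 1-based alignment span
    return [read.query_name, read.reference_name,
            read.reference_start + 1, read.reference_end]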
Example #7
    def generate_index(self):
        print("Generating samtools index...")
        sys.stdout.flush()
        try:
            subprocess.check_call(['samtools', 'faidx', self.fasta_out])
        except subprocess.CalledProcessError:
            sys.exit("Invalid genome fasta input, please check the source.")

        print("done\n")

        # for custom references, validate fasta contig names match definition in contig_defs:
        # PRIMARY_CONTIGS must be a subset of contig names in genome.fa
        if self.contig_defs['reference_name'] not in STANDARD_GENOMES:
            regtools.validate_contig_names(self.fasta_out + '.fai',
                                           self.contig_defs)

        print("Generating pyfasta indexes...")
        sys.stdout.flush()
        pyf = pyfasta.Fasta(self.fasta_out, key_fn=lambda x: x.split()[0])
        contigs = len(pyf)
        size = sum(len(pyf[contig]) for contig in pyf)

        print("    Number of contigs: %d\n    Total genome size: %d" %
              (contigs, size))
        print("done\n")
Example #8
def get_dn_ds_from_fasta(input_fasta, output_prefix):
    try:
        os.mkdir(output_prefix)
    except FileExistsError:
        pass
    fasta_in = pyfasta.Fasta(input_fasta)
    genes = list(fasta_in.keys())
    output_dn_ds = OrderedDict()
    if os.path.basename(input_fasta).startswith("N"):
        if "permissive" in input_fasta:
            output_file = os.path.join(output_prefix, os.path.basename(input_fasta).split(".permissive.fasta")[0] + ".permissive.dn_ds")
        else:
            output_file = os.path.join(output_prefix, os.path.basename(input_fasta).split(".strict.fasta")[0] + ".strict.dn_ds")
    else:
        output_file = os.path.join(output_prefix, os.path.basename(input_fasta).split(".fasta")[0] + ".dn_ds")
    if os.path.exists(output_file): 
        with open(output_file) as out_f:
            for line in out_f:
                line_s = line.split("\t")
                last_gene = line_s[0]
        idx = genes.index(last_gene) + 1  # resume after the last gene written
    else:
        # Do the whole thing
        idx = 0
    with open(output_file, "a" if idx else "w") as out_f:  # append when resuming
        #with open(output_file) as out_f_old:
        #    for line in out_f_old:
        #        out_f.write(line)
        for gene in genes[idx:]:
            out_ds = get_dn_ds_from_alignment(input_fasta, these_samples=[gene], do_window=True,
                                              gene_name=gene, cbs_reference=False, window=200,
                                              step=10, hoffman=True)
            if out_ds is not None:
                rows = out_ds
                out_f.write(str(gene) + "\tOVERALL\t" + str(rows[0][0]) + "\t" + str(rows[0][1]) + "\n")
                for row in rows[1][gene]:
                    out_f.write(str(gene) + "\tWINDOW\t" + str(row[0]) + "\t" + str(row[1]) + "\n")
Example #9
def test_Interval_sequence():
    genome = pyfasta.Fasta('test/example.fa')
    l1 = Interval.from_string('1:858-967:1', genome=genome)
    l2 = Interval.from_string('1:858-967:-1', genome=genome)
    print(l1.sequence)
    print(l2.sequence)
    assert l1.sequence != l2.sequence
Example #10
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid",
                 default=False,
                 action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    fp = open(refbed)
    for i, row in enumerate(fp):
        s = STRLine(row)
        seq = genome[s.seqid][s.start - 1:s.end].upper()
        s.motif = get_motif(seq, len(s.motif))
        s.fix_counts(seq)
        if opts.checkvalid and not s.is_valid():
            continue
        edits.append(s)
        if i % 10000 == 0:
            print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(str(e))
Example #11
 def _contig_size_dict(self, path):
     """Find the distribution of contig sizes."""
     f = pyfasta.Fasta(path)
     self.contig_size_dict = {}
     for key in f.keys():
         self.contig_size_dict[key] = len(f[key])
     return self.contig_size_dict
Example #12
def count_bases_in_peaks(reference_path, peaks_file):
    """Count the total number of bases in peak regions (0-indexed)"""
    bases_in_peaks = 0
    ctg_mgr = ReferenceManager(reference_path)
    genome_fa = pyfasta.Fasta(ctg_mgr.fasta, key_fn=lambda x: x.split()[0])
    for peak in peak_reader(peaks_file):
        bases_in_peaks += len(genome_fa[peak.chrom][peak.start:peak.end])
    return bases_in_peaks
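peak_reader and ReferenceManager come from the surrounding codebase and are not shown. A minimal stand-in for the reader, assuming peaks are plain BED records (the names and fields here are assumptions):

from collections import namedtuple

Peak = namedtuple('Peak', ['chrom', 'start', 'end'])

def peak_reader(peaks_file):
    # Hypothetical reader: yield one 0-indexed Peak per BED line
    with open(peaks_file) as fh:
        for line in fh:
            chrom, start, end = line.rstrip('\n').split('\t')[:3]
            yield Peak(chrom, int(start), int(end))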
Example #13
def get_barcode_gc(ref_f, peaks_f, matrix):
    """Get mean GC% of peaks in a barcode"""
    ref_mgr = ReferenceManager(ref_f)
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    peak_GC = np.array([get_peak_GC_counts(peak, genome_fa, counts=False)
                        for peak in peak_reader(peaks_f)])
    barcode_GC = ((peak_GC * matrix.m) / np.array(matrix.m.sum(axis=0))).squeeze()
    return barcode_GC
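get_peak_GC_counts is likewise assumed; with counts=False it appears to return a per-peak GC fraction. A sketch of what such a helper might look like (the signature is a guess):

def get_peak_GC_counts(peak, genome_fa, counts=True):
    # Hypothetical helper: GC content of one peak region
    seq = str(genome_fa[peak.chrom][peak.start:peak.end]).upper()
    gc = seq.count('G') + seq.count('C')
    if counts:
        return gc, len(seq)
    return gc / float(len(seq)) if seq else 0.0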
Example #14
    def __init__(self, fasta, motifs_input, bg=None):
        self.all_motifs = []
        with open(motifs_input, "r") as infile:
            self.all_motifs = list(motifs.parse(infile, "jaspar"))

        # for large sequence header, only keep the text before the first space
        self.genome_seq = pyfasta.Fasta(fasta, key_fn=lambda x: x.split()[0])
        self.bg = bg
Example #15
 def _get_NrContigs(self, path):
     """Find the number of contigs in the fasta file."""
     try:
         f = pyfasta.Fasta(path)
         self.nrContigs = len(f)
         print("Contigs: " + str(self.nrContigs))
     except ValueError:
         self.nrContigs = 0
Example #16
 def _get_NrContigs(self, path):
     """Find the number of contigs in the fasta file."""
     f = pyfasta.Fasta(path)
     counter = 0
     for header in f:
         counter += 1
     self.nrContigs = counter
     print("Contigs: " + str(self.nrContigs))
Example #17
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)

            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))

            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))

            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
Example #18
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq",
                 action="store_true",
                 default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        for row in fp:
            s = STRLine(row)
            total += 1
            for ns in s.iter_exact_str(genome):
                if not ns.is_valid():
                    continue
                print(ns, file=newbed)
                retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #19
    def __init__(self, gtf_file, fasta_file):
        self.gtf_file = gtf_file

        def map_key(key):
            return key.split(' ')[0]

        self.fa = pyfasta.Fasta(fasta_file)
        self.fasta_file = fasta_file
        self.mapkeys = dict()
        for k in self.fa.keys():
            self.mapkeys[map_key(k)] = k
Example #20
    def __init__ (self, name, fasta, compress=True):
        """
        Create a reference object extract fasta ref if needed and create a sequence object per
        sequences found in the fasta file
        @param name     Name of the Reference
        @param fasta    Path to a fasta file (can be gzipped)
        @param compress Fasta output will be gzipped if True
        """
        print(("Create {} object".format(name)))
        # Create self variables
        self.name = name
        self.temp_dir = mkdtemp()
        self.compress = compress

        # Create a name for the fasta file to be generated
        self.modified_fasta = "{}_masked.fa{}".format(self.name, ".gz" if self.compress else "")

        try:
            # Test values
            assert self.name not in self.REFERENCE_NAMES, "Reference name <{}> is duplicated".format(self.name)
            assert is_readable_file(fasta), "{} is not a valid file".format(fasta)

            # If gzipped, unzip the reference fasta file into the temporary folder;
            # otherwise copy it into the temporary folder

            if is_gziped(fasta):
                print(" * Unzip fasta file in a temporary directory")
                self.fasta = gunzip(fasta, self.temp_dir)
            else:
                print(" * Copy fasta file in a temporary directory")
                self.fasta = cp(fasta, self.temp_dir)

            # Loading the fasta sequence in a pyfasta.Fasta (seq_record is a mapping)
            print(" * Parsing the file with pyfasta")
            seq_dict = {}
            fasta_record = pyfasta.Fasta(self.fasta, flatten_inplace=True)
            print(" * Found {} sequences in {}".format(len(fasta_record), self.name))

            for name, seq_record in list(fasta_record.items()):

                # Remove additional sequence descriptor in fasta header and create a Sequence object
                short_name = name.partition(" ")[0]
                assert short_name not in seq_dict, "Reference name <{}> is duplicated in <{}>".format(short_name,self.name)
                seq_dict[short_name] = Sequence(name=short_name, seq_record=seq_record)

            # Save to a name sorted ordered dict
            self.seq_dict = OrderedDict(sorted(list(seq_dict.items()), key=lambda x: x))

            # Add name to a class list
            self.ADD_TO_REFERENCE_NAMES(self.name)

        except Exception as E:
            self.clean()
            raise E
Example #21
    def __init__(self, ref_path, bg=None):
        ref_manager = ReferenceManager(ref_path)
        self.all_motifs = []
        if ref_manager.motifs is not None:
            with open(ref_manager.motifs, "r") as infile:
                self.all_motifs = list(motifs.parse(infile, "jaspar"))

        # for large sequence header, only keep the text before the first space
        self.genome_seq = pyfasta.Fasta(ref_manager.fasta,
                                        key_fn=lambda x: x.split()[0])
        self.bg = bg
Example #22
def open_reference(reference_path):
    ''' Open a reference fasta and rename the contigs to strip any fasta comments'''
    fasta = pyfasta.Fasta(get_fasta(reference_path))

    new_fasta = {}

    for (k, v) in fasta.items():
        key_prefix = k.split(" ")[0]
        new_fasta[key_prefix] = v

    return new_fasta
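The same header stripping can be done at load time with pyfasta's key_fn hook, as several other examples on this page do; a roughly equivalent sketch that avoids building a second dict:

def open_reference(reference_path):
    # Strip fasta comments via key_fn instead of rebuilding a dict
    return pyfasta.Fasta(get_fasta(reference_path), key_fn=lambda key: key.split()[0])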
Example #23
def circSeq(genomeFile, outPrefix, thread):
    global genome, FL
    genome = pyfasta.Fasta(genomeFile)
    FL = pd.read_csv(outPrefix + 'constructFL_Normal_adj.txt', sep='\t',
                     dtype={'exon_start': str, 'exon_end': str})
    fout = open(outPrefix + 'circSeq.fa', 'w')
    pool = Pool(processes=thread)
    seq = pool.map(getSeq, range(FL.shape[0]))
    pool.close()
    pool.join()
    for i in range(len(seq)):
        fout.write('>' + seq[i][0] + '\n' + seq[i][1] + '\n')
    fout.close()
    tidehunter(outPrefix + 'circSeq.fa', outPrefix + 'circSeq.th', thread)
    
Example #24
def main(opts):
    """
    main function
    :param opts: input parameters
    :return: file containing flanking sequences with mutations as iupac nucleotides, and vcf file
    with the selected sequences.
    """
    # input files
    vcf_reader = vcf.Reader(filename=opts.vcf_in)  # read in raw vcf
    reference = pyfasta.Fasta(opts.reference)  # read in reference genome
    with open(opts.snp_set, 'r') as f:
        # read only the first line of the selected-markers file
        first_line = f.readline()

    # output files
    primer_seq = open(opts.fasta_out, "w")  # output iupack nucleotide file

    # output vcf with only the selected sequences from raw vcf file. (contains all the info)
    writer_willem = vcf.Writer(open(opts.vcf_out, 'w'),
                               vcf_reader,
                               lineterminator='\n')

    # For each of the selected snps
    snp_index = first_line.split()
    for snp in snp_index:

        coordinate = snp.rsplit(".", 1)
        scaffold_len = len(str(reference[coordinate[0]]))
        start = int(coordinate[1]) - opts.length
        stop = int(coordinate[1]) + opts.length

        # if the reference sequence is not long enough, adjust start and stop.
        if start < 0:
            start = 0
        if stop > scaffold_len:
            stop = scaffold_len

        snp_seq = reference[coordinate[0]][start:stop]
        snp_locs, snp_call, main_snp = get_all_snps(vcf_reader, coordinate,
                                                    start, stop, writer_willem,
                                                    opts.maf, opts.call_rate)
        new_seq = parse_sequence(coordinate[1], snp_locs, snp_seq, snp_call,
                                 main_snp, start)

        primer_seq.write(coordinate[0] + ":" + str(start) + "-" + str(stop) +
                         ":" + coordinate[1] + "\t")
        primer_seq.write(new_seq + "\n")

    primer_seq.close()
    writer_willem.close()
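parse_sequence and get_all_snps are not shown. Encoding a biallelic SNP as a single IUPAC ambiguity nucleotide, as the docstring describes, reduces to a lookup like this (illustrative, not the author's implementation):

IUPAC = {
    frozenset('AG'): 'R', frozenset('CT'): 'Y',
    frozenset('GC'): 'S', frozenset('AT'): 'W',
    frozenset('GT'): 'K', frozenset('AC'): 'M',
}

def iupac_code(allele1, allele2):
    # Collapse two alleles into one IUPAC nucleotide, e.g. ('A', 'G') -> 'R'
    if allele1 == allele2:
        return allele1
    return IUPAC[frozenset(allele1 + allele2)]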
Example #25
def main():
    parser = argparse.ArgumentParser(description="Extract informative sites")
    parser.add_argument("aligned_fasta")
    parser.add_argument("-v",
                        "--vcf",
                        dest="vcf",
                        help="Extract genotypes from VCF")
    args = parser.parse_args()
    fasta_r = pyfasta.Fasta(args.aligned_fasta)
    indels = None
    if args.vcf is not None:
        # We should generate a list of indels for each sample
        indels = get_indels_from_vcf(args.vcf)
    extract_informative(fasta_r, indels)
Example #26
    def pyfasta_fasta(n):
        print('timings for pyfasta.Fasta')
        ti = []
        tf = []
        for _ in range(n):
            t = time.time()
            f = pyfasta.Fasta(fa_file.name)
            ti.append(time.time() - t)

            t = time.time()
            read_dict(f, headers)
            tf.append(time.time() - t)
            os.remove(fa_file.name + '.flat')
            os.remove(fa_file.name + '.gdx')
        # profile memory usage and report timings
        tracemalloc.start()
        f = pyfasta.Fasta(fa_file.name)
        read_dict(f, headers)
        os.remove(fa_file.name + '.flat')
        os.remove(fa_file.name + '.gdx')
        print(tracemalloc.get_traced_memory())
        print(mean(ti))
        print(mean(tf)/nreads/10*1000*1000)
        tracemalloc.stop()
Example #27
    def get_generator(self, loop_infinitely):
        #read bed_source into memory
        bed_fh = fp.get_file_handle(self.bed_source)
        data = []
        print("Reading bed file " + self.bed_source + " into memory")

        for a_row in bed_fh:
            a_row = a_row.rstrip().split("\t")
            data.append(
                Interval(chrom=a_row[0],
                         start=int(a_row[1]),
                         stop=int(a_row[2]),
                         labels=[self.labels_dtype(x) for x in a_row[3:]]))
        print("Finished reading bed file into memory; got " + str(len(data)) +
              "rows")
        if (self.num_to_load_for_eval > len(data)):
            print("num_to_load_for_eval is " + str(self.num_to_load_for_eval) +
                  " but length of data is " + str(len(data)) + "; adjusting")
            self.num_to_load_for_eval = len(data)
        random_obj = np.random.RandomState(self.random_seed)
        if (self.randomize_after_pass):
            data = shuffle_array(arr=data, random_obj=random_obj)

        #fasta extraction
        import pyfasta
        f = pyfasta.Fasta(self.fasta_data_source)

        idx = 0
        while (idx < len(data)):

            to_extract = data[idx:idx + 1]
            if (idx % 1000 == 0):
                print(to_extract)
            to_yield = f[
                to_extract[0].chrom][to_extract[0].start:to_extract[0].stop]
            to_yield = np.array([one_hot_encode[x] for x in to_yield])
            yield (to_yield, to_extract[0].labels,
                   (to_extract[0].chrom, to_extract[0].start,
                    to_extract[0].stop))

            idx += 1
            if (idx == len(data)):
                if (loop_infinitely):
                    if (self.randomize_after_pass):
                        data = shuffle_array(arr=data, random_obj=random_obj)
                    idx = 0
                else:
                    return  # PEP 479: raising StopIteration inside a generator is an error in Python 3.7+
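one_hot_encode is assumed from the enclosing module. A typical definition for upper-case DNA, mapping each base to a length-4 vector with N as all zeros, would be:

one_hot_encode = {
    'A': [1, 0, 0, 0],
    'C': [0, 1, 0, 0],
    'G': [0, 0, 1, 0],
    'T': [0, 0, 0, 1],
    'N': [0, 0, 0, 0],
}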
Example #28
    def __init__(self, fasta_path, gff3_path, seqid=None):
        """
        An annotated reference genome.

        Parameters
        ----------

        fasta_path : string
            Path to reference genome FASTA file.
        gff3_path : string
            Path to genome annotations GFF3 file.
        seqid : string, optional
            If given, limit the annotation table to a single chromosome.

        """

        # store initialisation parameters
        self._fasta_path = fasta_path
        self._gff3_path = gff3_path
        self._seqid = seqid

        # setup access to reference sequence
        self._fasta = pyfasta.Fasta(fasta_path)

        # setup access to GFF3 as a table
        if isinstance(gff3_path, (list, tuple)):
            tbl_features = etl.cat(*[etl.fromgff3(p) for p in gff3_path])
        else:
            tbl_features = etl.fromgff3(gff3_path)
        tbl_features = (tbl_features.unpackdict(
            'attributes', ['ID', 'Parent']).rename({
                'ID': 'feature_id',
                'Parent': 'parent_id',
                'end': 'stop'
            }).select(lambda row: (row.stop - row.start) > 0))

        # limit data to a single chromosome
        if seqid is not None:
            tbl_features = tbl_features.eq('seqid', seqid)
        self._tbl_features = tbl_features.cache()

        # index features by ID
        self._idx_feature_id = self._tbl_features.recordlookupone('feature_id')

        # index features by parent ID
        self._idx_parent_id = self._tbl_features.recordlookup('parent_id')

        # index features by genomic location
        self._idx_location = self._tbl_features.facetintervalrecordlookup(
            'seqid', 'start', 'stop', include_stop=True)
Example #29
def main():
    try:
        strtablefile = sys.argv[1]
        if strtablefile == "-": strtablefile = "/dev/stdin"
        genomeFile = sys.argv[2]
        if genomeFile == "-": genomeFile = "/dev/stdin"
    except IndexError:
        print(__doc__)
        sys.exit(1)
    sys.stderr.write("loading genome...\n")
    genome = pyfasta.Fasta(genomeFile)
    refkeys = dict([(key.split()[0], key) for key in genome.keys()])

    sys.stderr.write("processing each locus...\n")
    print "\t".join(["chrom", "start", "end", "score", "GC", "entropy"])
    ProcessEachLocus(strtablefile, genome, refkeys)
Example #30
def adjExplainNormal(genomeFile, outPrefix, thread, isSecond=False):
    global genome, FLdf_2, targetID_dict
    genome = pyfasta.Fasta(genomeFile)
    FLdf = pd.read_csv(outPrefix + "explainFL_Normal.txt", sep='\t')
    FLdf = FLdf.sort_values(by=["ID", "query_start"], ascending=True)
    FLdf_counts = FLdf['ID'].value_counts()
    FLdf.index = FLdf.ID
    FLdf_2 = FLdf.loc[FLdf_counts.index[FLdf_counts.values == 2], :].copy()
    FLdf_no2 = FLdf.loc[FLdf_counts.index[FLdf_counts.values != 2], :].copy()
    targetID = list(set(FLdf_2.index))
    targetID_dict = {}

    for i in range(FLdf_2.shape[0]):
        if FLdf_2.index[i] in targetID_dict:
            targetID_dict[FLdf_2.index[i]].append(i)
        else:
            targetID_dict[FLdf_2.index[i]] = [i]

    pool = Pool(processes=thread)
    result = pool.map(getNewFL, targetID)
    pool.close()
    pool.join()
    newFL_2 = []
    for i in result:
        for j in i:
            newFL_2.append(j)
    newFL_2 = pd.DataFrame(np.array(newFL_2).reshape(-1, 12))
    newFL_2.columns = FLdf_no2.columns
    result = pd.concat([newFL_2, FLdf_no2])
    result.to_csv(outPrefix + "explainFL_Normal_adj.txt",
                  sep="\t",
                  header=True,
                  index=False)
    if isSecond:
        circOrigin = result.loc[:, ['ID', 'strand']].copy()
        strandScore = []
        for i in circOrigin.strand:
            if i == '+':
                strandScore.append(1)
            elif i == '-':
                strandScore.append(-1)
            else:
                strandScore.append(0)
        circOrigin.strand = strandScore
        circOrigin.columns = ['ID', 'score']
        circOrigin = circOrigin.drop_duplicates('ID')
        return circOrigin