def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = tuple(fasta.keys())

    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
            outfile = open(filename, 'w')
        else:
            outfile = sys.stdout
        try:
            for line in fetch_sequence(args, fasta, name, start, end):
                outfile.write(line)
        except FetchError as e:
            raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()
    def test_split_seq(self):
        """ Fetch sequence by blocks """
        fa = Fasta('data/chr17.hg19.part.fa')
        
        gene = Fasta("data/gene.bed12.fasta")
        expect = gene[list(gene.keys())[0]][:].seq
        
        bed = "data/gene.bed12"
        with open(bed) as fi:
            record = fi.readline().strip().split("\t")

        chrom = record[0]
        start = int(record[1])
        strand = record[5]

        # parse bed12 format
        starts = [int(x) for x in record[11].split(",")[:-1]] 
        sizes = [int(x) for x in record[10].split(",")[:-1]]
        starts = [start + x  for x in starts]
        ends = [start + size  for start,size in zip(starts, sizes)] 
        
        # bed half-open
        if strand == "-":
            starts = [start + 1 for start in starts]
        else: 
            ends = [end - 1 for end in ends]
        
        intervals = zip(starts, ends) 
        result = fa.get_spliced_seq(chrom, intervals, rc=True)
        print(result.seq)
        print("====")
        print(expect)

        assert result.seq == expect
Example #3
def fetch(args):
    fasta = Fasta(args.fasta)
    regions = args.regions
    if args.list:
        with args.list as listfile:
            for region in listfile:
                regions.append(region.rstrip())
    for region in regions:
        region = region.split()[0]
        try:
            rname, interval = region.split(':')
        except ValueError:
            rname = region
            interval = None
        try:
            start, end = interval.split('-')
            sequence = fasta[rname][int(start) - 1:int(end)]
        except (AttributeError, ValueError):
            sequence = fasta[rname][:]
        if args.complement:
            sequence = sequence.complement
        if args.reverse:
            sequence = sequence.reverse
        line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
        if args.name:
            sys.stdout.write('>' + sequence.name + '\n')
        for line in wrap_sequence(line_len, sequence.seq):
            sys.stdout.write(line)
    fasta.close()
def main(options):
    transcripts=read_strand_file(options.strand)
    ref=Fasta(options.ref)
    for chrom in ref.keys():
        print(chrom, file=sys.stderr)
        print(">"+chrom)

        plus=np.array([False]*len(ref[chrom]))
        minus=np.array([False]*len(ref[chrom]))

        ti = 0
        for transcript in transcripts["chr"+chrom]:
            if not ti % 1000:
                print("\r"+chrom+":trans"+str(ti), file=sys.stderr)
            if transcript[0]=="+":
                plus[transcript[1]:transcript[2]]=True
            elif transcript[0]=="-":
                minus[transcript[1]:transcript[2]]=True
            ti+=1

        print(chrom+":writing", file=sys.stderr)
        chrom_tx_strand = "".join(MAP[1*plus+2*minus])
        #output=textwrap.fill(chrom_tx_strand,40)
        print(chrom_tx_strand)
        print(chrom+":done", file=sys.stderr)
def calc_bkgd_counts(fasta_filename, region_size_min,
                    region_size_max, ignore_chroms,
                    only_chroms, verbose):
    ''' calculate nuc frequencies for normalization.
        Returns: dict of nucleotide frequencies.
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename, as_raw = True)

    for chrom in fasta.keys():

        # skip data based on specified chromosomes
        if chrom in ignore_chroms: continue

        if only_chroms and chrom not in only_chroms: continue

        seq_len = len(fasta[chrom])
        for idx in range(seq_len + 1):

            for region_size in range(region_size_min,
                                     region_size_max + 1):

                nucs = fasta[chrom][idx:idx+region_size]

                nuc_counts[region_size][nucs] += 1

    # remove entries whose length differs from region_size
    # (iterate over a snapshot of the keys so entries can be removed while looping)
    for region_size, nuc_dict in nuc_counts.items():
        for nuc in list(nuc_dict.keys()):
            if len(nuc) != region_size:
                nuc_dict.pop(nuc)

    return nuc_counts
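A minimal usage sketch for calc_bkgd_counts above; the FASTA path and arguments are placeholders, and the function is assumed to be importable from the surrounding module.

# Hypothetical call: tally 1-2 bp contexts genome-wide, skipping the mitochondrial record.
counts = calc_bkgd_counts("genome.fa", region_size_min=1, region_size_max=2,
                          ignore_chroms=["chrM"], only_chroms=[], verbose=False)
print(counts[2]["CG"])  # Counter lookup: number of CG dinucleotides observed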
Example #6
    def test_renamed(self):
        """
        Check if sequences in a FASTA file are properly renamed.
        """
        renamer = bioformats.seqname.FastaSeqRenamer()
        renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(self.__fasta):
                output_fasta.write(line)

        # perform the reverse renaming
        rev_renamer = bioformats.seqname.FastaSeqRenamer()
        rev_renamer.read_renaming_dict(self.__renaming_dict)
        with open(self.__rev_output, "w") as rev_output_fasta:
            for line in rev_renamer.renamed(self.__output, reverse=True):
                rev_output_fasta.write(line)

        # compare the original and reverse-renamed FASTA files
        original_fasta = Fasta(self.__fasta)
        rev_renamed_fasta = Fasta(self.__rev_output)
        for x, y in zip(original_fasta.keys(), rev_renamed_fasta.keys()):
            self.assertEqual(x, y)

        # check if the missing sequence exception is raised
        del renamer.renaming_dict["seq2"]
        with self.assertRaises(MissingSeqNameError):
            for _ in renamer.renamed(self.__fasta):
                pass

        os.unlink(self.__output)
        os.unlink(self.__rev_output)
Example #7
def generate_sizes(name, genome_dir):
    """Generate a sizes file with length of sequences in FASTA file."""
    fa = os.path.join(genome_dir, name, "{}.fa".format(name))
    sizes = fa + ".sizes"
    g = Fasta(fa)
    with open(sizes, "w") as f:
        for seqname in g.keys():
            f.write("{}\t{}\n".format(seqname, len(g[seqname])))
Example #8
def main(options):
    """
    Iterate and remove motif (by setting to N)
    """
    ref=Fasta(options.ref)
    reg=re.compile(motif)
    for chrom in ref.keys():
        print(">"+chrom)
        new_seq=reg.sub("N"*motif_length, ref[chrom][:].seq.upper())
        print(new_seq)
def get_prot_lens(faa_file, phage):
    len_dict={}
    digits=get_digits(faa_file)
    #def make_seq_len_dict(faa):
    f=Fasta(faa_file)
    for i in f.keys():
        name=get_locus_tag(i, digits=digits, phage=phage)
        length=len(str(f[i]))
        len_dict[name]=length
    return len_dict
Example #10
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Filter fasta file based on regex.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.
    
    outfa : str
        Filename of output fasta file. Cannot be the same as infa.

    regex : str, optional
        Regular expression used for selecting sequences.

    v : bool, optional
        If set to True, select all sequence *not* matching regex.

    force : bool, optional
        If set to True, overwrite outfa if it already exists.

    Returns
    -------
        fasta : Fasta instance
            pyfaidx Fasta instance of newly created file
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    if os.path.exists(outfa):
        if force:
            os.unlink(outfa)
            if os.path.exists(outfa + ".fai"):
                os.unlink(outfa + ".fai")
        else:
            raise ValueError(
                    "{} already exists, set force to True to overwrite".format(outfa))
            
    filt_function = re.compile(regex).search
    fa = Fasta(infa, filt_function=filt_function)
    seqs = fa.keys()
    if v:
        original_fa = Fasta(infa)
        seqs = [s for s in original_fa.keys() if s not in seqs]
        fa = original_fa
    
    if len(seqs) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as out:
        for chrom in seqs:
            out.write(">{}\n".format(fa[chrom].name))
            out.write("{}\n".format(fa[chrom][:].seq))

    return Fasta(outfa)
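A rough usage sketch for filter_fasta above (file names are hypothetical):

# Keep only sequences whose names start with "chr"; overwrite the output if it exists.
kept = filter_fasta("genome.fa", "genome.chroms.fa", regex=r"^chr", force=True)
print(list(kept.keys()))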
def write_read_lengths_to_file(read_fasta_files, output_file):
    out=open(output_file,"w")
    out.write("fasta_file\tseq_id\tread_len\n")
    
    readfiles=list(read_fasta_files)
    for r in readfiles:
        f=Fasta(r)
        for i in f.keys():
            length=len(str(f[i]))
            fasta=r.split("/")[-1]
            sequence=i
            out.write("%s\t%s\t%s\n" % (fasta, sequence, length))
    out.close()
Example #12
def binding_sites(kmer, genome_fp):
    genome = Fasta(genome_fp)
    locations = {}
    kmer = str(kmer)
    for record in genome.keys():
        seq = str(genome[record])
        locations[record] = substr_indices(kmer, seq)
        # append reversed primer locations as well
        locations[record] += substr_indices(revcomp(kmer), seq)
    if locations == {}:
        raise ValueError(
            "No locations for {} found in fg genome!".format(kmer))
    return locations
Example #13
def fillgaps(consensusdict, fasta):
    """
    """
    print("filling consensus...")
    fastascaf = Fasta(fasta, mutable=True)
    for chrom in fastascaf.keys():
        for suc in consensusdict.keys():
            t1 = int(suc.split(":")[0])
            t2 = int(suc.split(":")[1])
            assert (t2 - t1) == len(fastascaf[chrom][t1:t2].seq)
#            print(consensusdict[suc])
#            print(fastascaf[chrom][t1:t2].seq)
            fastascaf[chrom][t1:t2] = consensusdict[suc]
#            print(fastascaf[chrom][t1:t2].seq)
    return(None)
Example #14
def chromosome_ends(genome_fp):
    '''
    Returns the locations of the starts/ends of each chromosome (record) in a
    genome where all the chromosomes are concatenated (e.g. the 2nd chromosome's
    start site is len(1st chromosome)); all indices are 0-based.
    '''
    genome = Fasta(genome_fp)
    len_so_far = 0
    chr_ends = {}
    for record in genome.keys():
        chromosome = genome[record]
        chr_len = len(chromosome)
        chr_ends[record] = [len_so_far, chr_len + len_so_far - 1]
        len_so_far += chr_len
    return chr_ends
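As a quick illustration of the mapping chromosome_ends returns (the path is a placeholder and the function above is assumed to be in scope):

# Each record name maps to its 0-based [start, end] offsets in the concatenated genome.
ends = chromosome_ends("genome.fa")
for name, (start, end) in ends.items():
    print(name, start, end)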
class TestFeatureKeyFunction:
    def __init__(self):
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        expect = ['BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3', 'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1', 'NM_001282549.1', 'NR_104212.1', 'NR_104215.1', 'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1', 'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1', 'XR_241079.1', 'XR_241080.1', 'XR_241081.1']
        result = sorted(self.genes.keys())
        assert result == expect

    def test_key_function_by_dictionary_get_key(self):
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        result = self.genes['MDM4'][100-1:150]
        assert str(result) == expect

    def test_key_function_by_fetch(self):
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        result = self.faidx.fetch('MDM4',
                             100, 150)
        assert str(result) == expect

    @raises(ValueError)
    def test_duplicated_keys(self):
        genes = Fasta(self.fasta, key_function=get_duplicated_gene_name)
Example #16
def generate_gap_bed(fname, outname):
    """ Generate a BED file with gap locations.

    Parameters
    ----------
    fname : str
        Filename of input FASTA file.

    outname : str
        Filename of output BED file.
    """ 
    f = Fasta(fname)
    with open(outname, "w") as bed:
        for chrom in f.keys():
            for m in re.finditer(r'N+', f[chrom][:].seq):
                bed.write("{}\t{}\t{}\n".format(chrom, m.start(0), m.end(0)))
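A hedged usage sketch (paths are placeholders): each run of Ns in the input becomes one 0-based, half-open BED interval.

generate_gap_bed("genome.fa", "genome.gaps.bed")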
Example #17
    def __init__(self, remote=False):
        if not remote:
            try:
                self.ref = Fasta(settings.REFERENCE_PATH)
            except IOError:
                self.ref = _RemoteReference(settings.BUILD)
        else:
            self.ref = _RemoteReference(settings.BUILD)

        # Add a get method. This will not be sensitive to "chr" prefixes.
        def get(fasta, chrom):
            chr_prefix = chrom.startswith("chr")
            try:
                return fasta[chrom]
            except KeyError:
                pass
            try:
                # If there was a prefix, we try without.
                if chr_prefix:
                    return fasta[chrom[3:]]
                # If there was no prefix, we try with.
                else:
                    return fasta["chr{}".format(chrom)]
            except KeyError:
                # If it is a true mismatch, we return None.
                return None

        self.ref.get = functools.partial(get, self.ref)
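A standalone sketch of the chr-prefix-insensitive lookup idea used above, assuming only pyfaidx and a placeholder FASTA path:

from pyfaidx import Fasta

def get_chrom(fasta, chrom):
    # Try the name as given, then with/without the "chr" prefix.
    for name in (chrom, chrom[3:] if chrom.startswith("chr") else "chr" + chrom):
        try:
            return fasta[name]
        except KeyError:
            continue
    return None

ref = Fasta("genome.fa")          # placeholder path
record = get_chrom(ref, "chr1")   # works whether the index uses "1" or "chr1"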
 def test_get_seq_rc(self):
     """ Check get_seq with rc argument """
     fa = Fasta('data/chr17.hg19.part.fa')
     
     result = fa.get_seq("chr17", 11, 20, rc=False)
     expect = "CCCTGTTCCT"
     print("normal")
     print(result.seq)
     print(expect)
     assert result.seq == expect
     
     result = fa.get_seq("chr17", 11, 20, rc=True)
     expect = "AGGAACAGGG"
     assert result.seq == expect
     print("rc")
     print(result.seq)
     print(expect)
def readFASTA(x, splitKey = None):
    """
    If x is the path to a FASTA file, load it with pyfaidx.Fasta; otherwise
    treat x as a literal sequence string and return it unchanged.
    """

    if type(x) is not str:
        raise TypeError("input must be type str. filename or sequence")
    if os.path.isfile(x):
        tmp_o = Fasta(x, key_function=lambda key: key.split()[0])
        if (splitKey is None):
            o = tmp_o
        else:
            o = { i.split(splitKey)[0] : tmp_o[i] for i in tmp_o.keys() }
    else:
        o = x
    return o
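Illustrative calls, assuming readFASTA above is in scope (the file name is made up):

seqs = readFASTA("proteins.fa", splitKey="|")  # dict keyed on the name part before "|"
raw = readFASTA("ACGTACGT")                    # not a file path, so returned unchanged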
Example #20
def fasta_stats(fasta_fp):
    """
    Retrieves the number of bases and number of records in a FASTA file. Also
    creates a FASTA index (.fai) for later searching. May be slow for very large
    files.
    """
    # pyfaidx can't handle blank lines within records, so we have to check :(
    check_empty_lines(fasta_fp)
    try:
        fasta = Fasta(fasta_fp)
        length = fasta_len_quick(fasta_fp)
        nrecords = len(fasta.keys())
        return length, nrecords
    except:
        click.secho(
            "\nError reading %s: invalid FASTA format?" % fasta_fp, fg="red")
        raise
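For example (the path is a placeholder, and fasta_stats above is assumed to be in scope):

length, nrecords = fasta_stats("contigs.fa")
print("%d bp in %d records" % (length, nrecords))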
 def test_get_sequence_bed(self):
     expected = "GGATGGTGTGGTAG"
     coords = ["chr2", 13, 27, "ENSMUST7", ".", "+"]
     with Fasta("tests/test_genome.fa") as pf_genome:
         observed = get_sequence(coords,
                                 pf_genome,
                                 bed_input=True,
                                 strip_chr=True)
     self.assertEqual(observed, expected)
Example #22
 def test_keys(self):
     fasta = Fasta('data/genes.fasta',
                   split_char='|',
                   duplicate_action="drop")
     expect = [
         '530364724', '530364725', '530364726', '530373235', '530373237',
         '530384534', '530384536', '530384538', '530384540', '543583738',
         '543583740', '543583785', '543583786', '543583788', '543583794',
         '543583795', '543583796', '557361097', '557361099', '563317589',
         'AB821309.1', 'KF435149.1', 'KF435150.1', 'NM_000465.3',
         'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
         'NM_001282549.1', 'NR_104212.1', 'NR_104215.1', 'NR_104216.1',
         'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
         'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
         'XR_241079.1', 'XR_241080.1', 'XR_241081.1', 'dbj'
     ]
     result = sorted(fasta.keys())
     assert result == expect
Example #23
def Find_Random_panhandles(path_to_intervals, energy_threshold, handle_length_threshold, panhandle_length_threshold, k,
                    genome_file, threads, need_suboptimal, kmers_stacking_matrix, N_seeds, strandness, what):
    start_time = time.time()
    df = pd.read_csv(path_to_intervals, sep='\t')
    df["gene_chr_start_end_strand"] = df.chr + "_" + df.start_gene.map(str) + "_" + df.end_gene.map(str) + "_" + df.strand
    df["interval_chr_start_end_strand"] = df["chr"] + "_" + df["start_interval"].map(str) + "_" + df["end_interval"].map(str) + "_" + df["strand"]
    df['start_interval'] = df['start_interval'].astype(int)
    df['end_interval'] = df['end_interval'].astype(int)
    if not ('sequences' in list(df.columns.values)):
        print('Attaching sequences..')
        genome = Fasta(genome_file)
        GetSequencesForDF2 = partial(GetSequencesForDF, genome)
        df.loc[:, 'sequences'] = df.apply(GetSequencesForDF2, axis=1)
        if strandness:
            print("Making complement of minus strand..")
            df.loc[:, 'sequences'] = df.apply(MakeComplement, axis=1)         
        df.to_csv("../out/intervals_with_seqs.tsv", sep='\t', index=False)
    df['sequences'] = df['sequences'].str.upper()  # a bare map() would leave a lazy iterator under Python 3

    for seed in range(1, N_seeds + 1):  
        print(seed)  
        ##shuffle
        df_new = df.copy() 
        df_new = Shuffle(df_new, what, seed)
        df_new["sequences_indxd"] = df_new['sequences'].apply(lambda x: Index_seq(x, k))
        df_new = df_new.loc[df_new.sequences_indxd != False]
        print("Creating files..")
        with open('../out/genes_done2.txt', 'w') as done_f:
            done_f.write('Started alignment: \n')
        results_one_gene_table = pd.DataFrame(
            {'gene': [], 'energy': [],
             'start_al1': [], 'end_al1': [], 'start_al2': [], 'end_al2': [],
             'alignment1': [], 'alignment2': [], 'structure': [], 'interval1': [], 'interval2': []})
        with open('../out/random_panhandles' + str(seed) + '.tsv', 'w') as f:
            results_one_gene_table.to_csv(f, sep='\t', index=False, header=True)
        with open('../out/counts_close_' + str(seed) + '.txt', 'w') as f:
            f.write('')
        print('Start to align..')
        p = mp.Pool(processes=threads)
        print('Created pool')
        m = mp.Manager()
        print('Created manager')
        lock = m.Lock()
        print('Created lock')
        f_shuffled = '../out/intervals_shuffled_' + str(seed) + '.tsv'
        df_new.to_csv(f_shuffled, sep='\t', index=False, header=True)
        Find_panhandles_one_gene2 = partial(Find_panhandles_one_gene, lock, df_new, energy_threshold, handle_length_threshold,
                                            panhandle_length_threshold, k, need_suboptimal, kmers_stacking_matrix, seed)
        print('Created partial')
        genes = df_new["gene_chr_start_end_strand"].unique()
        print('Created genes')
        p.map(Find_panhandles_one_gene2, genes)
        p.close()
        p.join()
    print("all done!")
    print(time.time() - start_time)
    return (0)
Example #24
def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type):
    fai = Fasta(args.r)
    if liftover_type == "unplaced":
        open(args.dir + "/unplaced_genes.fa", 'w')
    for chrom in ref_chroms:
        fasta_out = get_fasta_out(chrom, args.r, liftover_type, args.dir)
        sorted_parents = sorted(list(parent_dict.values()), key=lambda x: x.seqid)
        write_gene_sequences_to_file(chrom, args.r, fai, sorted_parents, fasta_out)
        fasta_out.close()
Example #25
 def from_linear_reference(cls,
                           fasta_file_name,
                           reference_name="ref",
                           k=15,
                           only_store_kmers=False):
     logging.info("Only store kmers? %s" % only_store_kmers)
     logging.info("k=%d" % k)
     genome_sequence = str(Fasta(fasta_file_name)[reference_name])
     return cls.from_sequence(genome_sequence, k, only_store_kmers)
Example #26
 def __init__(self, filename):
     """ filename example: pacbio_new_gene_model.all_phase_peptide """
     self.filename = filename
     self.peptidefasta = Fasta("../data/pacbio/" + filename + ".fasta")
     print(self.peptidefasta)
     self.signalplines = get_lines("../data/pacbio",
                                   filename + ".fasta.signalp")
     self.position2manualinfo = get_position2manualinfo(
         "../data/pacbio/pacbio_new_gene_model.tab")
Example #27
    def process_txt(self):
        """
        process tab-delimited text file, containing the following columns:
        CHR    POS    REF    ALT    SAMPLE_ID

        """

        fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

        nbp = (self.args.length - 1) // 2
        samples_dict = {}

        numsites_keep = 0
        numsites_skip = 0
        chrseq = '0'

        with open(self.args.input, 'r') as txt_file:
            reader = csv.reader(txt_file, delimiter='\t')

            for row in reader:
                chrom = row[0]
                pos = int(row[1])
                ref = row[2]
                alt = row[3]
                sample = row[4]

                if sample not in samples_dict:
                    samples_dict[sample] = self.subtypes_dict.fromkeys(
                        self.subtypes_dict, 0)

                if chrom != chrseq:
                    sequence = fasta_reader[chrom]
                    chrseq = chrom

                if (len(alt) == 1 and len(ref) == 1):
                    mu_type = ref + alt
                    category = getCategory(mu_type)
                    if nbp > 0:
                        lseq = sequence[pos - (nbp + 1):pos + nbp].seq
                    else:
                        lseq = sequence[pos - 1].seq
                        # eprint("lseq:", lseq)
                    motif_a = getMotif(lseq)
                    subtype = str(category + "." + motif_a)

                    if subtype not in self.subtypes_dict:
                        continue

                    samples_dict[sample][subtype] += 1

            mdf = pd.DataFrame(samples_dict).T.fillna(0)
            samples = mdf.index.tolist(
            )  #instead of using samples_dict with sorted(), which leads to mismatching, simply retain the explicit ordering of the matrix dataframe.
            M = mdf.values

        out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
        return out
Example #28
 def test_fetch_whole_fasta(self):
     expect = [
         line.rstrip('\n') for line in open('data/genes.fasta')
         if line[0] != '>'
     ]
     result = list(
         chain(*([line for line in record]
                 for record in Fasta('data/genes.fasta.gz', as_raw=True))))
     assert expect == result
Example #29
def aligned_long_reads(input_sam_file, reference, output): #Get transcript sequences from 'paired-ended' reads, reference genome required
	ref = Fasta(reference)
	with open(output,'w') as f2:
		with open(input_sam_file,'rb') as f1:
			for line in f1:
				if not line.startswith("@"):
					alignment = line.strip().split('\t')
					ID = alignment[0]
					FLAG = alignment[1]
					chromesome = alignment[2]
					left = int(alignment[3])
					Read_length = len(alignment[9])
					PE_size = int(alignment[8])
					Read = Seq(alignment[9])
					if FLAG == "99":
						first_sequence = Read
						next_alignment = f1.next().strip().split('\t')
						next_ID = next_alignment[0]
						next_FLAG = next_alignment[1]
						next_sequence = Seq(next_alignment[9])
						next_MDZ = next_alignment[12].split(':')[-1]
						if next_ID == ID and next_FLAG == "147":
							if PE_size <= 160:
								head_length = tail_length = PE_size - 80
								shared_length = 160 - PE_size
								if first_sequence[80 - shared_length:80] == next_sequence[0:shared_length]:
									sequence = first_sequence[0:head_length] + next_sequence
								else:
									continue
							else:
								gap = PE_size - 160
								sequence = first_sequence + Seq(ref[chromesome][left-1 + 80: left-1 + 80 + gap].seq) + next_sequence
							size = len(sequence)
							f2.write('%s\t%s\t%d\t%s\n' %(ID, sequence, size, "forward"))
						else:
							continue
					elif FLAG == "163":
						first_sequence = Read						
						next_alignment = f1.next().strip().split('\t')
						next_ID = next_alignment[0]
						next_FLAG = next_alignment[1]
						next_sequence = Seq(next_alignment[9])
						if next_ID == ID and next_FLAG == "83":
							if PE_size <= 160:
								head_length = tail_length = PE_size - 80
								shared_length = 160 - PE_size
								if first_sequence[80 - shared_length:80] == next_sequence[0:shared_length]:
									sequence = (first_sequence[0:head_length] + next_sequence).reverse_complement()
								else:
									continue
							else:
								gap = PE_size - 160
								sequence = (first_sequence + Seq(ref[chromesome][left-1 + 80: left-1 + 80 + gap].seq) + next_sequence).reverse_complement()
							size = len(sequence)
							f2.write('%s\t%s\t%d\t%s\n' %(ID, sequence, size, "reverse"))
						else:
							continue
 def test_slice_whole_entry(self):
     fasta = Fasta('data/genes.fasta')
     if test_bio:
         with open('data/genes.fasta', "rU") as fh:
             seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
         assert str(fasta['gi|557361099|gb|KF435150.1|'][::3]) == str(
             seqio['gi|557361099|gb|KF435150.1|'].seq[::3])
     else:
         raise SkipTest
Example #31
 def test_reverse_missing_ref(self):
     ''' check that reverse_var works correctly
     '''
     genome = Fasta(self.fa)
     var = self.Var(chrom='N', pos=1, ref='N', alts=['.'])
     rev = reverse_var(var, genome)
     self.assertEqual(rev.ref, 'N')
     self.assertEqual(rev.alts, ['.'])
     self.assertEqual(rev.pos, 1)
 def splitGenome(self):
     fa = Fasta(self.fastaFileName)
     for seq in fa:
         with open('{}{}.fa'.format(STANDARD_GENOME_PATH, seq.name),
                   'w') as out:
             out.write('>{}\n'.format(seq.name))
             for line in wrap_sequence(70, str(seq)):
                 out.write(line)
     print("<<<<<<<Splitted>>>>>>")
Example #33
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    Faidx(target_fasta_name)
    target_fasta_dict = Fasta(target_fasta_name,
                              key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            out = open(inter_files + "/" + chrm + ".fa", 'w')
            out.write(">" + chrm + "\n" + str(target_fasta_dict[chrm]))
    return target_fasta_dict
Example #34
class FastaWrapper(GenomeWrapper):
    def __init__(self,
                 fasta_file,
                 alpha='dna',
                 one_hot=True,
                 channel_last=True,
                 in_mem=False,
                 thread_safe=False,
                 read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
        self.fasta = Fasta(fasta_file,
                           as_raw=True,
                           sequence_always_upper=True,
                           read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead
        if in_mem:
            fasta_onehot_dict = self._encode_seqs(self.fasta)
            self.fasta.close()
            self.fasta = fasta_onehot_dict
            self.thread_safe = True
        else:
            if thread_safe:
                self.fasta.close()
                self.fasta = fasta_file

    def close(self):
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        # Converts a FASTA object into a dictionary of one-hot coded boolean matrices
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            seq = record[:]
            seq = np.array(list(seq))
            fasta_dict[record.name] = seq
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        if self.in_mem:
            seq = self.fasta[chrom][start:stop]
        else:
            if self.thread_safe:
                fasta = Fasta(self.fasta,
                              as_raw=True,
                              sequence_always_upper=True,
                              read_ahead=self.read_ahead)
                seq = np.array(list(fasta[chrom][start:stop]))
                fasta.close()
            else:
                seq = np.array(list(self.fasta[chrom][start:stop]))
        return seq
Example #35
 def get_genomic_context(df, genome, n=85):
     """get the genomic context of variants
        necessary input columns: ['CHROM', 'POS']; added output columns: ['SEQ', 'SEQ_LONG']"""
     for i in df.index:
         df.at[i, 'SEQ_LONG'] = Fasta(genome)[str(
             df.at[i,
                   'CHROM'])][(df.at[i, 'POS'] -
                               (n + 101)):(df.at[i, 'POS'] + (n + 100))].seq
         df.at[i, 'SEQ'] = df.at[i, 'SEQ_LONG'][100:-100]
     return df
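A minimal sketch, assuming get_genomic_context above is available as a plain function and that "genome.fa" (a placeholder) contains a chr1 record; note the function reopens the FASTA for every row, so pre-loading the Fasta object would be faster for large tables.

import pandas as pd

variants = pd.DataFrame({"CHROM": ["chr1"], "POS": [123456]})
variants = get_genomic_context(variants, "genome.fa", n=85)
print(variants.loc[0, "SEQ"])  # 2*n+1 bp of context centered on POS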
Example #36
def gc_correct(input, output, reference, frac_n, frac_r, iter, frac_lowess):
    fasta = Fasta(reference)
    bed_lines = [
        BedLine(*map(attempt_numeric, x.split("\t"))) for x in open(input)
    ]
    corrected = correct(bed_lines, fasta, frac_n, frac_r, iter, frac_lowess)

    with open(output, "wb") as ohandle:
        for line in corrected:
            ohandle.write(bytes(str(line) + "\n", 'utf-8'))
Example #37
def extract_chromosome_data(chromosome_key, start, end):
    data_path = path_for_chromosome_data(chromosome_key)

    if not data_path.exists():
        raise Exception("Chromosome data not downloaded")

    all_data = Fasta(str(data_path))
    sliced_data = all_data[chromosome_key][start:end].seq

    return sliced_data
Example #38
 def __init__(self, strain, feature):
     self.strain = strain
     self.feature = feature
     self.get_feature_ids()
     self.get_scaffold2len()
     self.get_id2left_right()
     self.get_id2distance()
     self.scaffoldfasta = Fasta("data/" + self.strain +
                                "_CLC_de_novo_rmhost_mod.fa",
                                as_raw=True)
Example #39
 def read_names_to_csv(file_pathway, sample_name):
     """
     This function takes in the fasta files and pulls the read names and then outputs them into csv files.
     :param file_pathway: Pathway to Fasta files
     :param sample_name: Name of current sample being loaded (i.e. bio_mom)
     :return: The output filename
     """
     output_filename = f"./read_names/{cross_used}/{sample_name}.csv"
     output_df = pd.DataFrame()
     print(f"-- Loading {sample_name} Fasta file --")
     file_read = Fasta(file_pathway)
     print(f"-- Loaded {sample_name} Fasta file --")
     file_read_names_list = [name for name in file_read.keys()]
     file_read_names_list.sort()
     output_df['names'] = file_read_names_list
     print(f"-- Output {sample_name} read names to {sample_name}.csv --")
     output_df.to_csv(output_filename, sep=',', index=False)
     print(f'-- {sample_name} finished --')
     return output_filename
Example #40
def parse_hgvs(hgvs_name, fasta, genes):
    genome = Fasta(fasta, key_function=lambda x: 'chr{}'.format(x))

    with open(genes) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    return hgvs.parse_hgvs_name(hgvs_name, genome, get_transcript=get_transcript)
Example #41
def INDEX_GENOME(OUTDIR, GENOME_FILE):
	LOGGER.info('Indexing the genome')
	GENOMEIDX = Fasta(GENOME_FILE)
	GENOMEPREFIX = os.path.splitext(GENOME_FILE)[0]
	FAIDX = pd.read_csv(GENOME_FILE + '.fai', sep='\t', names=['SCAFFOLD', 'SCAFF_LENGTH', 'three', 'four', 'five'])
	#FAIDX = FAIDX[['SCAFFOLD', 'SCAFF_LENGTH']]
	FILE = GENOMEPREFIX + '.fai'
	INDEX = os.path.join(OUTDIR, FILE)
	FAIDX.to_csv(INDEX, sep='\t', header=False, index=False)
	return INDEX
Example #42
    def post(self):
        gene_ids = request.get_json(force=True)['gene_ids']
        edit = request.get_json(force=True)['edit']
        genome = request.get_json(force=True)['genome']
        if not gene_ids:  # TODO improve
            raise BadRequest('gene_ids not set')

        if genome not in ['hg19', 'mm10']:  #
            raise BadRequest(f'{genome} not supported')

        if edit and len(gene_ids) != 1:
            raise BadRequest('gene_ids needs to have length 1 if editing..')

        # TODO here goes all the computation for checking whether SNP and CNSD
        # influence the guides. For now return the 6 best guides
        aggregation_pipeline = [
            # filter our genes
            {
                '$match': {
                    '$and': [{
                        'gene_id': {
                            '$in': gene_ids
                        }
                    }, {
                        'genome': genome
                    }]
                }
            },
            # unwind guides so we can access their score
            # {'$unwind': '$guides'},
            # # sort by score
            # {'$sort': {'guides.score': -1}},
            # # group guides together again (contrary of unwind)
            # {'$group': {
            #     '_id': '$_id',
            #     'gene_id': {'$first': '$gene_id'},
            #     'chromosome': {'$first': '$chromosome'},
            #     'pdbs': {'$first': '$pdbs'},
            #     'exons': {'$first': '$exons'},
            #     'guides': {'$push': '$guides'}
            # }},
        ]
        result = list(guide_collection.aggregate(aggregation_pipeline))
        if edit:
            df = gencode_exons(genome)
            exons = df[(df.gene_id == gene_ids[0])]
            chromosome = exons.seqname.iloc[0]
            # TODO here i have to change things..
            fasta = Fasta(GENOME_FILE.format(GENOME), as_raw=True)

            seq = fasta[chromosome][min(exons.start):max(exons.end)]
            # if self.strand == '-':  # i think this is done on the client...
            #     seq = seq.reverse.complement
            result[0]['sequence'] = seq
        return result
Example #43
class FastaOnehot:
    def __init__(self, file=None, seqlen=1000):
        self.fa = Fasta(file, sequence_always_upper=True, as_raw=True)
        self.l = len(self.fa.keys())
        self.seqlen = seqlen
        self.dna_encoder = LabelEncoder().fit(array(['A', 'C', 'G', 'N', 'T']))
        self.onehot_encoder = OneHotEncoder(sparse=False).fit(
            array(list(range(0, 5))).reshape(-1, 1))

    def toOnehot(self, chunksize=10000):
        k = list(self.fa.keys())
        r = 0
        i = 0
        while r < self.l:
            seqnames = k[i * chunksize:min((i + 1) * chunksize, self.l)]
            seq = [
                array(list(self.fa[x][:].ljust(self.seqlen, 'N'))) for x in seqnames
            ]
            int_encoded = [self.dna_encoder.transform(s) for s in seq]
            int_encoded = [s.reshape(len(s), 1) for s in int_encoded]
            onehot_encoded = array(
                [self.onehot_encoder.transform(s) for s in int_encoded])
            r = min((i + 1) * chunksize, self.l)
            i += 1
            print('last record : ' + seqnames[-1])
            yield (seqnames, onehot_encoded)

    def toh5(self, file="onehot.h5", chunksize=10000):
        with h5py.File(file, 'w') as f:
            cs = min(chunksize, self.l)
            f.create_dataset(
                "seqnames",
                data=[np.string_(s) for s in list(self.fa.keys())],
                chunks=(cs, ))
            f.create_dataset("onehot",
                             shape=(self.l, self.seqlen, 5),
                             maxshape=(None, self.seqlen, 5),
                             chunks=(cs, self.seqlen, 5))
            i = 0
            for n, o in self.toOnehot(chunksize=chunksize):
                f["onehot"][(i * cs):min(((i + 1) * cs), self.l)] = o
                i += 1
Example #44
def raw_error_rate(fig_fn):
	n = 0
	tmp_out = os.path.dirname(os.path.abspath(fig_fn)) + '/raw_cons_error.out'
	for sample, read_fn, ref_fn, info_fn, cons_ep_fn in zip(samples, read_fas, ref_fns, cons_info_fns, cons_ep_fns):  # assumes the file list is named cons_ep_fns; reusing the loop variable name would shadow it
		read_fa = Fasta(read_fn)
		ref_fa = Fasta(ref_fn)
		with open(ref_fn) as ref_fp, open(cons_ep_fn) as cons_ep_fp, open(info_fn) as info_fp, open(tmp_out, 'w') as out_fp:
			out_fp.write('Sample\tCopyNum\tRawError\tConsError\n')
			last_name = ''
			for cons_name in ref_fa.keys():
				read_name = cons_name.rsplit('_')[0]
				if read_name == last_name:
					continue
				copy_num, raw_error, cons_error = 0, 0, 0
				ref_seq = ref_fa[cons_name][:].seq.upper()
				read_seq = read_fa[read_name][:].seq.upper()
				raw_error = get_mp_error_rate(ref_seq, read_seq)
				if raw_error < 0: continue

				for eline in cons_ep_fp:
					if eline.startswith('#'): continue
					ele = eline.rsplit()
					name, error = ele[ep_idx['#READ_NAME']], float(ele[ep_idx['ERR_RATE']][:-1]) / 100.0
					if name == cons_name:
						cons_error = error
					else:
						continue

				for sline in info_fp:
					ele = sline.rsplit()
					name, num = ele[info_idx['CONS_NAME']], ele[info_idx['COPY_NUM']]
					if name == cons_name:
						copy_num = int(num)
					else:
						continue
				out_fp.write('{}\t{}\t{}\t{}\n'.format(sample, copy_num, raw_error, cons_error))
				last_name = read_name
				n+=1
				if n== 10:
					sys.exit(1)
	cmd = 'Rscript /home/gaoy1/program/circ_plot/error_rate.R {} {}'.format(ep_fn, fig_fn)
	print(cmd)
def get_transcripts(reference_file, transcript_file, vcf_file):
    """Take a FASTA reference file and a VCF file, and generate a FASTA file
    with changes from the vcf file"""
    shutil.copyfile(reference_file, transcript_file)
    transcripts = Fasta(transcript_file, mutable=True)
    with open(vcf_file) as f:
        for (accession, pos, ref, alt) in get_variations(f):
            if accession not in transcripts:
                raise ValueError('VCF accession {0} not found in reference'.\
                                 format(accession))
            transcripts[accession][(pos - 1):pos] = alt
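A minimal sketch of calling get_transcripts above (all three paths are placeholders):

# Copy reference.fa to transcripts.fa, then patch in the SNVs listed in variants.vcf.
get_transcripts("reference.fa", "transcripts.fa", "variants.vcf")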
    def test_ncbiseqrename_fasta(self):
        """
        Check if NCBI sequence names in a FASTA file are properly
        changed.
        """
        sys.argv = ['', self.__fasta, 'genbank', self.__output, 'ucsc',
                    '--chr', self.__chr, '--unloc', self.__unloc,
                    '--unpl', self.__unpl, '--fasta']

        bioformats.cli.ncbirenameseq()

        # check if the obtained and original files are the same
        original_fasta = Fasta(self.__ucsc_fasta)
        renamed_fasta = Fasta(self.__output)
        for x, y in zip(original_fasta.keys(), renamed_fasta.keys()):
            self.assertEqual(x, y)

        os.unlink(self.__ucsc_fasta + '.fai')
        os.unlink(self.__output)
        os.unlink(self.__output + '.fai')
Example #47
def read_pep_fa(protein_file):
    import pandas as pd
    proteins = Fasta(str(protein_file))
    pl = []
    for v in proteins:
        names = v.long_name.split(" ", 8)
        d = {"protein_id": names[0], 'protein_type': names[1]}
        d = {**d, **dict([n.split(":", 1) for n in names[2:]])}
        d['seq'] = str(proteins[v.name])
        pl.append(d)
    return pd.DataFrame(pl)
Example #48
def fasta_extract_regions(fa_fname, intervals):
    """Extract an iterable of regions from an indexed FASTA file.

    Input: FASTA file name; iterable of (seq_id, start, end) (1-based)
    Output: iterable of string sequences.
    """
    with Fasta(fa_fname, as_raw=True) as fa_file:
        for chrom, rows in groupby(intervals, lambda cse: cse[0]):
            logging.info("Extracting sequences from chromosome %s", chrom)
            for _chrom, start, end in rows:
                yield fa_file[_chrom][start:end]
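For instance (the path and intervals are illustrative; intervals must already be grouped by sequence name for the groupby above to work):

intervals = [("chr1", 0, 100), ("chr1", 200, 300), ("chr2", 50, 150)]
for seq in fasta_extract_regions("genome.fa", intervals):
    print(len(seq))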
Example #49
def processMAF(args, subtypes_dict):
    
    fasta_reader = Fasta(args.fastafile, read_ahead=1000000)
    
    nbp = (args.length-1)//2
    samples_dict = {}

    # M = np.zeros((len(samples), len(subtypes_dict)))
    numsites_keep = 0
    numsites_skip = 0
    chrseq = '0'

    f = open(args.input, 'r', encoding = "ISO-8859-1")

    reader = csv.DictReader(filter(lambda row: row[0]!='#', f), delimiter='\t')
    counter = 0
    for row in reader:

        if(row['Variant_Type'] != "SNP"): continue
            
        pos = int(row['Start_position'])
        ref = row['Reference_Allele']
        alt = row['Tumor_Seq_Allele2']
        sample = row[args.groupvar]
        
        if row['Chromosome'] != chrseq:
            sequence = fasta_reader[row['Chromosome']]
            chrseq = row['Chromosome']
        
        counter += 1
        mu_type = ref + alt
        category = getCategory(mu_type)
        lseq = sequence[pos-(nbp+1):pos+nbp].seq
        
        motif_a = getMotif(pos, lseq)
        subtype = str(category + "." + motif_a)
        st = subtypes_dict[subtype]

        if sample not in samples_dict:
            samples_dict[sample] = {}

        if subtype not in samples_dict[sample]:
            samples_dict[sample][subtype] = 1
        else:
            samples_dict[sample][subtype] += 1

        if (counter%1000 != 0): continue
        util_log.debug(args.input + ": " + str(counter) + " sites counted")

    M = DataFrame(samples_dict).T.fillna(0).values
    samples = sorted(samples_dict)

    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
    return out
    def set_peak_sequences_using_fasta(self,
                                       fasta_file_location="grch38.fasta"):
        logging.info("Setting peak sequences using fasta index")
        genome = Fasta(fasta_file_location)
        i = 0
        for peak in self.peaks:
            if i % 10000 == 0:
                logging.info("%d/%d peaks processed" % (i, len(self.peaks)))
            i += 1

            peak.set_sequence_using_fasta_index(genome)
Example #51
def generate_fasta(intersection_bedtool, fasta_filename, revcomp, verbose):

    if verbose:
        print >> sys.stderr, ">> generating fasta of positions ..."

    # -s: force strandedness
    fasta_seqs = intersection_bedtool.sequence(fi=fasta_filename, s=True)

    fasta = Fasta(fasta_seqs.seqfn)

    return fasta
def construct_hg38_map(n2nl_aln, hg38_bam):
    """Constructs a map of hg38 position -> sequence alignment position -> MSA position"""
    # construct sequence alignment position -> MSA position map using the MSA
    aln_f = Fasta(n2nl_aln)
    seq_aln_map = defaultdict(dict)
    for name, seq in aln_f.iteritems():
        seq_pos = 0
        for aln_pos, x in enumerate(str(seq)):
            seq_aln_map[name][seq_pos] = aln_pos
            if x != '-':
                seq_pos += 1

    # find maximum position for reversing negative strand
    max_pos = {x: max(y.keys()) for x, y in seq_aln_map.iteritems()}

    # construct a hg38 -> sequence positions using the sequences trivially mapped back to hg38
    hg38_map = {}
    for rec in pysam.Samfile(hg38_bam):
        m = {y: x for x, y in rec.aligned_pairs}
        # invert positions for negative strand genes
        if rec.qname in ['NOTCH2', 'NOTCH2NL-A', 'NOTCH2NL-B']:
            m = {x: max_pos[rec.qname] - y for x, y in m.iteritems()}
        hg38_map[rec.qname] = m

    # construct a table mapping each alignment position to all hg38 positions
    r = defaultdict(dict)
    for name, pos_map in hg38_map.iteritems():
        for hg38_pos, seq_pos in pos_map.iteritems():
            aln_pos = seq_aln_map[name][seq_pos]
            r[name][aln_pos] = hg38_pos

    # now invert this map, so that we have our hg38 -> aln map
    final_map = {}
    for name in r:
        for aln_pos in r[name]:
            hg38_pos = r[name][aln_pos]
            assert hg38_pos not in final_map
            final_map[hg38_pos] = aln_pos

    return final_map
Example #53
def size(args):
    if args.header:
        print("seqid\tsize")
    fname, fext = op.splitext(args.fi)
    if args.fi in ['stdin', '-'] or fext in ['.gz','.bz2']:
        fh = must_open(args.fi)
        for rcd in SeqIO.parse(fh, "fasta"):
            sid, size = rcd.id, len(rcd)
            if args.bed:
                print("%s\t%d\t%d" % (sid, 0, size))
            else:
                print("%s\t%d" % (sid, size))
    elif fext in [".%s" % x for x in FastaExt]:
        from pyfaidx import Fasta
        fas = Fasta(args.fi)
        for sid in fas.keys():
            size = len(fas[sid])
            if args.bed:
                print("%s\t%d\t%d" % (sid, 0, size))
            else:
                print("%s\t%d" % (sid, size))
    else:
        logging.error("%s is not a supported format" % fext)
def split_fasta(number_files, fasta_file):
    try:
        fasta = Fasta(fasta_file)
    except Exception:
        print("could not open fasta")
        exit()
    seq_names = list(fasta.keys())
    number_seqs = len(seq_names)
    splits = int(np.ceil(number_seqs / float(number_files)))
    # chunk boundaries; the final boundary is the total number of sequences
    ranges = list(range(0, number_seqs, splits)) + [number_seqs]
    print(ranges)

    for i in range(0, number_files):
        start = ranges[i]
        stop = ranges[i + 1]
        label = re.sub(r"\.fa.*", "." + str(i + 1) + ".fasta", fasta_file)
        out = open(label, "w")

        for f in seq_names[start:stop]:
            out.write(">" + f + "\n" + str(fasta[f]) + "\n")
        out.close()
Example #55
def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()
    if args.invert_match:
        sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
        regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
        split_function = ucsc_split

    header = False
    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.size_range:
            if start is not None and end is not None:
                sequence_len = end - start
            else:
                sequence_len = len(fasta[name])
            if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
                continue
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
            outfile = open(filename, 'w')
        elif args.out:
            outfile = args.out
        else:
            outfile = sys.stdout
        try:
            if args.transform:
                if not header and args.transform == 'nucleotide':
                    outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                    header = True
                outfile.write(transform_sequence(args, fasta, name, start, end))
            else:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
        except FetchError as e:
            raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()
Example #56
def count_crickMAX(args):
    """Count the number of sequences in the Crick fasta file"""
    with open(args.crick, 'r') as crick_in:
        count = 0
        for line in crick_in:
            if line.startswith('>'):
                count +=1
    return count


if __name__ == '__main__':
    args = parse_options()
    CRICK_MAX =  count_crickMAX(args)
    print "now starting Fasta import"
    seq_in = Fasta(args.seqin)
    print "done with Fasta import"
    clusters = open(args.clusters)
    outsam = args.samout


# path = '/Volumes/data/epiGBS/Baseclear/Athal/'
# path = '/Volumes/data/epiGBS/DNAVISION/Project_DNA11032___140919_SN170_0407_AC52R6ACXX/Sample_DNA11032-001-L1/output/seqykJJfz/scabiosa/'
# path = '/tmp/'
# path = '/Volumes/data/epiGBS/FINAL/Scabiosa/BASECLEAR/'
# seq_in = Fasta(path+'Scabiosa_combined.fa')
#fasta_in = SeqIO.parse(open('/tmp/test.fa', 'r'), 'fasta')
seq_in_keymap = {}
for key in seq_in.keys():
    seq_in_keymap[key.split(';')[0]] = key
    faidx_rec = seq_in[key]
Example #57
def rename(args):
    import re
    from pyfaidx import Fasta

    fi, fo, fmf, fmb = args.fi, args.fo, args.fmf, args.fmb
    merge_short, gap = args.merge_short, args.gap
    prefix_chr, prefix_ctg = args.prefix_chr, args.prefix_ctg

    db = Fasta(fi)

    ptn1 = "^(chr)?([0-9]{1,2})"
    ptn2 = "chromosome *([0-9]{1,2})"

    sdic, cdic = dict(), dict()
    ccnt = 1
    for sid in db.keys():
        size = len(db[sid])
        res1 = re.search(ptn1, sid, re.IGNORECASE)
        if res1:
            sdic[sid] = [int(res1.group(2)), size]
        else:
            sid_long = db[sid].long_name
            res2 = re.search(ptn2, sid_long, re.IGNORECASE)
            if res2:
                sdic[sid] = [int(res2.group(1)), size]
            else:
                cdic[sid] = [ccnt, size]
                ccnt += 1

    if len(sdic.keys()) == 0:
        print("Error: no chromosomes detected")
        sys.exit(1)

    slst = sorted(sdic.items(), key = lambda t: t[1][0])
    clst = sorted(cdic.items(), key = lambda t: t[1][0])

    nchrom = slst[-1][1][0]
    sdigits = ndigit(slst[-1][1][0])
    cdigits = ndigit(clst[-1][1][0]) if len(clst) > 0 else 1
    sfmt = "%s%%0%dd" % (prefix_chr, sdigits)
    cfmt = "%s%%0%dd" % (prefix_ctg, cdigits)
    logging.debug("%d chromosomes, %d scaffolds/contigs" % (len(sdic), len(cdic)))

    fname, fext = op.splitext(fi)
    if fext not in [".%s" % x for x in FastaExt]:
        logging.error("%s is not a supported format" % fext)
        sys.exit(1)

    fho = open(fo, "w")
    fhf = open(fmf, "w")
    fhb = open(fmb, "w")
    for sid, sval in slst:
        scnt, size = sval
        nsid = sfmt % scnt
        fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (sid, 0, size, nsid, 0, size, scnt))
        fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (nsid, 0, size, sid, 0, size, scnt))
        nrcd = SeqRecord(Seq(str(db[sid])), id = nsid, description = '')
        SeqIO.write(nrcd, fho, "fasta")
    i = nchrom + 1
    if len(clst) > 0 and merge_short:
        zid = "%sx" % prefix_chr
        if sdigits == 2:
            zid = "%s99" % prefix_chr
        else:
            assert sdigits == 1, "wrong number of chroms: %d" % sdigits
        pos = 0
        seq = ''
        for cid, sval in clst:
            ccnt, size = sval
            start, end = pos, pos + size
            if pos > 0:
                start += gap
                end += gap
                seq += "N" * gap
            seq += str(db[cid])
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, zid, start, end, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (zid, start, end, cid, 0, size, i))
            pos = end
            i += 1
        nrcd = SeqRecord(Seq(seq), id = zid, description = '')
        SeqIO.write(nrcd, fho, "fasta")
    else:
        for cid, sval in clst:
            ccnt, size = sval
            ncid = cfmt % ccnt
            fhf.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (cid, 0, size, ncid, 0, size, i))
            fhb.write("%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n" % (ncid, 0, size, cid, 0, size, i))
            nrcd = SeqRecord(Seq(str(db[cid])), id = ncid, description = '')
            SeqIO.write(nrcd, fho, "fasta")
            i += 1
    fhf.close()
    fhb.close()
    fho.close()
Example #58
    def test_renamed(self):
        formats = (
            "refseq_full",
            "genbank_full",
            "refseq_gi",
            "genbank_gi",
            "refseq",
            "genbank",
            "chr_refseq",
            "chr_genbank",
            "chr",
        )
        for i, j in itertools.product(formats[:-1], formats):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            for k in self.__acc_num_files:
                renamer.read_ncbi_acc_num(k, i, j)
            # convert sequence IDs
            input_file = os.path.join(self.__test_dir, "ncbi_" + i + ".fa")
            with open(self.__output, "w") as output_fasta:
                for line in renamer.renamed(input_file):
                    output_fasta.write(line)

            example_file = os.path.join(self.__test_dir, "ncbi_" + j + ".fa")

            for k in (self.__output + ".fai", example_file + ".fai"):
                if os.path.isfile(k):
                    os.unlink(k)

            output_fasta = Fasta(self.__output)
            example_fasta = Fasta(example_file)
            # compare the obtained file to the example
            self.assertEqual(list(output_fasta.keys()), list(example_fasta.keys()))

        # test for an incorrect format
        with self.assertRaises(SeqRenameError):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            renamer.read_ncbi_acc_num("unknown", "chr_refseq", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))
        with self.assertRaises(SeqRenameError):
            renamer.read_ncbi_acc_num("chr_refseq", "unknown", os.path.join(self.__test_dir, "ncbi_chr_refseq.fa"))

        # test for an incorrect NCBI accession number dictionary
        with self.assertRaises(IncorrectDictError):
            renamer.read_ncbi_acc_num(self.__chr_incorrect, "refseq_full", "chr_refseq")

        # check if sequence versions are removed
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for k in self.__acc_num_files:
            renamer.read_ncbi_acc_num(k, "chr", "genbank", remove_seq_version=True)
        input_file = os.path.join(self.__test_dir, "ncbi_chr.fa")
        example_file = os.path.join(self.__test_dir, "ncbi_genbank_nover.fa")
        with open(self.__output, "w") as output_fasta:
            for line in renamer.renamed(input_file):
                output_fasta.write(line)

        for k in (self.__output + ".fai", example_file + ".fai"):
            if os.path.isfile(k):
                os.unlink(k)

        output_fasta = Fasta(self.__output)
        example_fasta = Fasta(example_file)
        self.assertEqual(list(output_fasta.keys()), list(example_fasta.keys()))
        os.unlink(example_file + ".fai")

        # remove temporary files and FASTA indices
        os.unlink(self.__output)
        os.unlink(self.__output + ".fai")
        for i in formats:
            os.unlink(os.path.join(self.__test_dir, "ncbi_" + i + ".fa.fai"))
    z = defaultdict(list)  # amount of sequence passed out of Cactus
    for species in inputList:
        print species
        x[species] = 0
        y[species] = 0
        z[species] = []
        for file in os.listdir('.'):
            if file.endswith('.fai') and protId[species] in file:
                with open(file,'r') as f:
                    lines = f.readlines()
                    for line in lines:
                        if line:
                            x[species] += int(line.split('\t')[1])#abs(int(line.split('\t')[3]) - int(line.split('\t')[2]))#max(map(int,line.split('\t')[2:4]))-min(map(int,line.split('\t')[2:4]))
        for folder in [folder2 for folder2 in fastaFolders if species+'.fa' in os.listdir(folder2)]:
            try:
                fa = Fasta(folder+species+'.fa')
                #bedText = '\n'.join('\t'.join(['_'.join(line.split('_')[:-2])] + line.split('_')[-2:]) for line in fa.keys())
                y[species] += sum([len(fa[key][:].seq) for key in fa.keys()])#findlen(BedTool(bedText, from_string=True))
            except:
                print 'Error for ' + folder+species+'.fa'
            print y[species]

    """
    with open('finalSyntenyMultipleSpecies.bed','r') as f:
        print 'Bed Open...'
        lineOut = []
        for line in f.readlines():
            lineOut.append('-'.join(line.split('\t')[0:4])+'|'+line[line.rfind('\t')+1:])
        for line in lineOut:
            for seq in line.split('|'):
                y[specId[seq.split('-')[0]]] += abs(int(seq.split('-')[3]) - int(seq.split('-')[2]))
    """
from pyfaidx import Fasta

maysFasta = Fasta('name')

maysFasta.keys()