def generate_files4blast_xen(dict_in, fasta_in, path_for_files):
    """
    Write one single-entry fasta file per key of dict_in that is also an id
    in fasta_in.

    Inputs:
        dict_in - the dictionary whose keys are the xenopus reference names
            that have a phosphorylated residue
        fasta_in - the fasta file that has the xenopus reference information
        path_for_files - the folder that receives the "<key>.fa" files
    Outputs:
        Nothing is returned
    """
    # load every record of the reference fasta, keyed by sequence id
    with open(fasta_in) as source:
        sequences_by_id = {
            record.id: record.sequence_as_string()
            for record in fastaparser.Reader(source)
        }

    for ref_name in dict_in.keys():
        # references missing from the fasta file are silently skipped
        if ref_name not in sequences_by_id:
            continue

        target = pathlib.Path(path_for_files, ref_name + ".fa")
        # use fastaparser to write the single entry
        with open(target, 'w') as sink:
            fastaparser.Writer(sink).writefasta(
                (ref_name, sequences_by_id[ref_name]))
    def read_fasta_file(self, fasta_file):
        """Return the sequence string of the last record in a FASTA file.

        Args:
            fasta_file: Path of the FASTA file to read.

        Returns:
            The sequence of the final record, as a string. When the file
            holds several records the earlier ones are read and discarded.

        Raises:
            ValueError: If the file contains no FASTA records.
        """
        last_sequence = None
        # Use a separate name for the handle so the path argument stays
        # available for the error message below.
        with open(fasta_file) as handle:
            for record in fastaparser.Reader(handle):
                last_sequence = record.sequence_as_string()

        if last_sequence is None:
            raise ValueError(f"no FASTA records found in {fasta_file!r}")
        return last_sequence
def open_genome_file():
    """Return all sequences of the genome FASTA file joined into one string.

    The file path is the module-level ``genome_directory`` concatenated with
    ``genome_file_name``. Records are joined in file order with no separator;
    an empty file yields an empty string.
    """
    with open(genome_directory + genome_file_name) as fasta_file:
        parser = fastaparser.Reader(fasta_file)
        # join once instead of growing the string with += for every record
        return "".join(seq.sequence_as_string() for seq in parser)
    def test_design_crispr_grnas(
        self,
        discover_client: DISCOVERClient,
    ):
        """Design gRNAs on the dummy organism and check the shape of the result."""
        # Example FASTA file, located relative to the project root
        fasta_path = (
            get_project_root()
            / 'teselagen/examples/pytested/dummy_organism.fasta'
        )

        # Only the first record of the file is used
        with open(fasta_path) as handle:
            for record in fastaparser.Reader(handle):
                fasta_seq: str = cast(FastaSequence, record).sequence_as_string()
                break

        # Call method to be tested
        result = discover_client.design_crispr_grnas(
            sequence=fasta_seq,
            target_indexes=(500, 600),
        )

        assert isinstance(result, dict)
        for expected_key in ('guides', 'target_indexes'):
            assert expected_key in result
        assert len(result['guides']) == 7
Exemple #5
0
def generate_sequences(path, chr_=''):
    """Call main_funct for ``path``, optionally with a sequence from the genome file.

    With the default empty ``chr_``, ``main_funct(path)`` is called once.
    Otherwise "data/genomes/hg19_hard_50.fa" is scanned and, for every record
    whose header equals ``chr_``, the header is printed and
    ``main_funct(path, sequence)`` is called with that record's sequence.
    """
    import fastaparser

    if chr_ == '':
        main_funct(path)
        return

    with open("data/genomes/hg19_hard_50.fa") as genome:
        for record in fastaparser.Reader(genome, parse_method='quick'):
            if record.header != chr_:
                continue
            print(record.header)
            main_funct(path, record.sequence)
def fasta_for(path_to_zipfile):
    '''
    Return the RefSeq CDS records of a zipped dataset as a dictionary.
    path_to_zipfile: The relative path to the zipfile containing the virus data report

    The archive member 'ncbi_dataset/data/cds.fna' is read as text; each
    FASTA record is stored under its id with its sequence as a string.
    '''
    with zipfile.ZipFile(path_to_zipfile, 'r') as archive, \
            io.TextIOWrapper(
                archive.open('ncbi_dataset/data/cds.fna')) as cds_text:
        return {
            record.id: record.sequence_as_string()
            for record in fastaparser.Reader(cds_text)
        }
Exemple #7
0
def fasta_to_json(fasta_path: Path) -> None:
    """Write the sequences of a FASTA file to BENCHMARK_FILEPATH as JSON.

    Each record is keyed by the second "|"-separated field of its header.
    The resulting mapping is wrapped as ``{"inputs": {...}}`` before being
    dumped. A header without a "|" raises IndexError.
    """
    with fasta_path.open() as handle:
        records = fastaparser.Reader(handle, parse_method="quick")
        sequences_by_protein = {
            record.header.split("|")[1]: record.sequence
            for record in records
        }

    payload = {"inputs": sequences_by_protein}
    with BENCHMARK_FILEPATH.open("w") as out:
        json.dump(payload, out)
Exemple #8
0
def fasta_parser(path):
    """Load a gzip-compressed FASTA file into a pandas DataFrame.

    Args:
        path: Path of the gzip-compressed FASTA file (opened in text mode).

    Returns:
        A DataFrame with one row per record and the columns ``ID``,
        ``Description`` and ``Sequence``.
    """
    fasta_df = pd.DataFrame()
    print("Done")
    with gzip.open(path, 'rt') as fasta_file:
        reader = fastaparser.Reader(fasta_file)
        print("File loaded")
        # The reader consumes the file handle, so all three fields must be
        # collected in a single pass; a second pass would see no records.
        records = [
            (seq.id, seq.description, seq.sequence_as_string())
            for seq in reader
        ]

    fasta_df['ID'] = [record[0] for record in records]
    print("Column 1 of 3 added")
    fasta_df['Description'] = [record[1] for record in records]
    print("Column 2 of 3 added")
    fasta_df['Sequence'] = [record[2] for record in records]
    print("Column 3 of 3 added")

    return fasta_df
def filter_fasta(fasta_in, rem_str, fasta_out):
    """
    Copy a fasta file, leaving out every record whose id matches rem_str.

    INPUTS:
    fasta_in - name with path from current folder for the input fasta file
    rem_str - the regular expression string; a record is dropped when the
        expression is found anywhere in its sequence id
    fasta_out - name with path from current folder for the output fasta file

    OUTPUTS:
    the dictionary (sequence id -> sequence string) of the records that
        were kept
    ** fasta_out is written
    """

    # read the input with fastaparser, keeping only the ids that do not
    #   match the removal pattern
    with open(fasta_in) as source:
        kept = {
            record.id: record.sequence_as_string()
            for record in fastaparser.Reader(source)
            if not re.search(rem_str, record.id)
        }

    # write the kept records to the output fasta file
    with open(fasta_out, 'w') as sink:
        writer = fastaparser.Writer(sink)
        for seq_id, residues in kept.items():
            writer.writefasta((seq_id, residues))

    # hand the kept records back to the caller
    return kept
def winnow_fasta(fasta_fp, base_empress_df, out_stringent_fp):
    """Write the FASTA records flagged for stringent use to a new file.

    Each record of ``fasta_fp`` is matched by id against the ``CONS_NAME``
    column of ``base_empress_df``. Records without a metadata row are
    skipped. A record is written to ``out_stringent_fp`` when its value in
    the ``STRINGENT_TEST_COL`` column equals ``STRINGENT_INCLUDE_VAL``.

    Args:
        fasta_fp: Path of the input FASTA file.
        base_empress_df: Metadata DataFrame holding the ``CONS_NAME`` and
            ``STRINGENT_TEST_COL`` columns.
        out_stringent_fp: Path of the output FASTA file (overwritten).

    Raises:
        ValueError: If more than one metadata row matches a record id.
    """
    # The output file is opened first, as before, so it is created (or
    # truncated) even if the input cannot be opened.
    with open(out_stringent_fp, 'w') as stringent_file, \
            open(fasta_fp) as fasta_file:
        for seq in fastaparser.Reader(fasta_file):
            # match header to consensus_seq_name column of metadata
            seq_metadata_df = base_empress_df.loc[
                base_empress_df[CONS_NAME] == seq.id]

            if len(seq_metadata_df) == 0:
                continue
            if len(seq_metadata_df) > 1:
                raise ValueError(f"More than one metadata row with "
                                 f"consensus sequence name "
                                 f"'{seq.id}' found")

            seq_stringent_test_val = \
                seq_metadata_df.loc[:, STRINGENT_TEST_COL].iat[0]
            if seq_stringent_test_val == STRINGENT_INCLUDE_VAL:
                # format the record only when it is actually written
                stringent_file.write(seq.formatted_fasta() + '\n')
Exemple #11
0
#!/usr/bin/env python3
#https://github.com/mor16fsu/bch5884
#Mitchell Roth

from DNALib import *
import fastaparser
from matplotlib import pyplot as plt

print("")

print("The DNA input sequence is:")

# Parse the FASTA file and echo every record's sequence. The analysis below
# runs on the last record read, so it is kept in an explicit variable
# instead of relying on the loop variable surviving the loop.
dna_sequence = None
with open("mEGFP2.fa") as fasta_file:
    parser = fastaparser.Reader(fasta_file, parse_method='quick')
    for seq in parser:
        dna_sequence = seq.sequence
        print(dna_sequence)
        print()

# An empty FASTA file leaves nothing to analyse; stop with a clear message
# rather than a NameError further down.
if dna_sequence is None:
    raise SystemExit("No sequences found in mEGFP2.fa")

print("")
print("If error in sequence print False, if no errors print None")
print(checkseq(dna_sequence))

print("")
print("The frequency of each nucleotide is:")
print(countnuc(dna_sequence))

print("")
print("The transcript of the sequence is:")
print(transcribe(dna_sequence))
Exemple #12
0

def memGraph(seqeunce, N):
    """Plot the result of memFunction(seqeunce, N) against the integers 1..N.

    The x axis is labelled "r" and the y axis "F"; the figure is shown
    immediately with plt.show().
    """
    values = memFunction(seqeunce, N)
    positions = list(range(1, N + 1))
    plt.plot(positions, values)
    plt.xlabel("r")
    plt.ylabel("F")
    plt.show()


# Ask the user for a FASTA file without showing an empty Tk main window.
root = tk.Tk()
root.withdraw()
filepath = filedialog.askopenfilename()

# A cancelled dialog returns an empty value; stop cleanly instead of
# failing inside open().
if not filepath:
    raise SystemExit("No file selected")

# For every record, convert the sequence with sequenceToBinary and pass the
# result to corGraph. The with-statement closes the file on exit, so no
# explicit close() is needed.
with open(filepath) as fasta_file:
    reader = fp.Reader(fasta_file)
    for seq in reader:
        sequence = seq.sequence_as_string()
        corGraph("1", "1", 40, sequenceToBinary("A", "G", "", sequence))
        print("\n")
Exemple #13
0
import fastaparser
import sys

# Map every sequence id in the FASTA file given as the first command-line
# argument to its sequence string.
seq_dict = {}

with open(sys.argv[1]) as fasta_file:
    for record in fastaparser.Reader(fasta_file):
        seq_dict[record.id] = record.sequence_as_string()

# Ordered pairs of distinct ids where the last three characters of the
# first sequence equal the first three characters of the second.
pairs = [
    [tail_id, head_id]
    for tail_id, tail_seq in seq_dict.items()
    for head_id, head_seq in seq_dict.items()
    if tail_id != head_id and tail_seq[-3:] == head_seq[:3]
]

for tail_id, head_id in pairs:
    print(tail_id + " " + head_id)