def fasta_file(self, fasta_path, scaffold_number):
    """Extract a single scaffold from a FASTA file into its own file.

    The record whose id is ``scaffold_<scaffold_number>`` is written to
    ``./scaffold_<n>_info/scaffold_<n>.fasta``.  ``self.scaffold_number``
    (stored as a string) and ``self.dir`` (the output directory name) are
    set as side effects.

    Args:
        fasta_path (str): path of the FASTA file to read.
        scaffold_number (int | str): number of the scaffold to select.

    Raises:
        SystemExit: when no record with that scaffold id exists (a message
            is printed first).
    """
    # Normalise once: the original concatenated the raw argument with a
    # string, which raised TypeError for the documented int input.
    self.scaffold_number = str(scaffold_number)
    scaffold_id = 'scaffold_' + self.scaffold_number

    fasta = ph.read_fasta(fasta_path)

    if scaffold_id not in fasta.id.values:
        print('Scaffold ' + self.scaffold_number + ' nao existente!')
        # Same effect as the interactive ``exit()`` helper, but does not
        # depend on the ``site`` module being loaded.
        raise SystemExit

    self.dir = scaffold_id + '_info'
    # exist_ok: running twice for the same scaffold must not crash.
    os.makedirs(self.dir, exist_ok=True)

    fasta.drop(columns=['description', 'name'], inplace=True)
    to_save = fasta[fasta.id == scaffold_id].copy()
    to_save.id = '>' + to_save.id

    # One field per line: with a newline separator and no quoting each row
    # is emitted as the ">id" header followed by its remaining fields.
    to_save.to_csv(os.path.join('.', self.dir, scaffold_id + '.fasta'),
                   sep='\n',
                   index=False,
                   header=False,
                   quoting=csv.QUOTE_NONE)
Esempio n. 2
0
    def test_to_nexus(self, path_to_dat):
        """Export the seed FASTA alignment to NEXUS and check the file exists."""
        source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
        target = os.path.join(path_to_dat, 'test.nexus')

        # Read the FASTA file, then write it back out in NEXUS format.
        frame = ph.read_fasta(source)
        frame.phylo.to_nexus(alphabet='protein', filename=target)

        assert os.path.exists(target)
Esempio n. 3
0
 def test_to_fasta(self, path_to_dat):
     """Write the seed alignment back out as FASTA and check the file exists."""
     frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))
     print(frame.phylo)

     # Export through the ``phylo`` attribute of the data frame.
     destination = os.path.join(path_to_dat, 'test.fasta')
     frame.phylo.to_fasta(destination)
     assert os.path.exists(destination)
Esempio n. 4
0
    def test_to_embl(self, path_to_dat):
        """Export the seed FASTA alignment to EMBL and check the file exists."""
        source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
        target = os.path.join(path_to_dat, 'test.embl')

        # Read the FASTA file, then write it back out in EMBL format.
        frame = ph.read_fasta(source)
        frame.phylo.to_embl(alphabet='protein', filename=target)

        assert os.path.exists(target)
Esempio n. 5
0
    def test_to_phylip(self, path_to_dat):
        """Export the seed FASTA alignment to PHYLIP and check the file exists."""
        source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
        target = os.path.join(path_to_dat, 'test.phylip')

        # Read the FASTA file, then write it back out in PHYLIP format.
        frame = ph.read_fasta(source)
        frame.phylo.to_phylip(target)

        assert os.path.exists(target)
Esempio n. 6
0
def test_read_fasta(path_to_dat):
    """Check the type and the expected columns of a frame read from FASTA."""
    frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))

    columns = frame.keys()
    assert type(frame) == ph.DataFrame
    # Columns the reader is expected to provide.
    for expected in ('id', 'sequence', 'description'):
        assert expected in columns
Esempio n. 7
0
def test_to_fasta(path_to_dat):
    """Write a single row of the seed alignment to FASTA and check the file."""
    frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))

    # Only the first record is exported.
    first_record = frame.iloc[0]

    destination = os.path.join(path_to_dat, 'test.fasta')
    first_record.phylo.to_fasta(destination)
    assert os.path.exists(destination)
Esempio n. 8
0
def run(df,program="muscle",keep_tmp=False,**kwargs):
    """
    Align the sequences of a data frame with an external alignment program.

    df: data frame with "uid" and "sequence" columns.
    program: key selecting the alignment backend ("muscle" or "msaprobs").
    keep_tmp: keep the temporary input/output fasta files.
    kwargs: forwarded unchanged to the selected alignment function.

    Returns the columns of df (minus the old "sequence") left-merged on "uid"
    with the sequences read back from the aligner's output file.

    Raises ValueError for an unknown program and RuntimeError when any row
    ends up without a sequence after the merge (the temporary files are
    left on disk in that case).
    """

    backends = {"muscle":_align_muscle,
                "msaprobs":_align_msaprobs}

    # Pick the backend; an unknown key is reported with the valid choices.
    try:
        aligner = backends[program]
    except KeyError:
        pieces = ["Alignment program '{}' not recognized.\n\n".format(program),
                  "Should be one of:\n"]
        for name in sorted(backends.keys()):
            pieces.append("    {}\n".format(name))
        raise ValueError("".join(pieces))

    # Ten random letters give the temporary files a (probably) unique prefix
    prefix = "".join(random.choice(string.ascii_letters) for _ in range(10))
    input_file = "{}_align_in.fasta".format(prefix)
    output_file = "{}_align_out.fasta".format(prefix)

    phy.seqio.write.to_fasta(df,id_col="uid",filename=input_file)
    aligner(input_file,output_file,**kwargs)

    # Read the aligned sequences back, keyed by uid
    aligned = phy.read_fasta("{}".format(output_file))
    aligned = pd.DataFrame({"uid":aligned.id,
                            "sequence":aligned.sequence})

    # Swap the unaligned sequence column for the aligned one
    without_sequence = df.copy().drop(labels=["sequence"],axis=1)
    output = pd.merge(without_sequence, aligned, on=['uid'], how='left')

    # A missing sequence means a uid did not come back from the aligner
    if output.sequence.isnull().sum() > 0:
        err = "an unknown error caused sequences to be lost during alignment.\n"
        err += "temporary files {} and {} saved.\n".format(input_file,output_file)
        raise RuntimeError(err)

    # Clean up unless asked to keep the intermediate files
    if not keep_tmp:
        for leftover in (output_file, input_file):
            os.remove(leftover)

    return output
Esempio n. 9
0
def upload():
    """Save every uploaded "file" part under ``<APP_ROOT>/files`` and parse it.

    For each saved upload the module-level ``fasta_path`` is set to the
    stored path and ``fasta`` to the result of ``pp.read_fasta`` on it, so
    after the loop both globals describe the last file processed.

    Returns:
        The rendered ``algorithmSelectionPage.html`` template.
    """
    global fasta_path, fasta

    target = os.path.join(APP_ROOT, "files/")
    print(target)

    # exist_ok avoids a crash if the directory appears between check and
    # creation.
    os.makedirs(target, exist_ok=True)

    for uploaded in request.files.getlist("file"):
        # The filename is chosen by the client: keep only its last path
        # component so something like "../../x" cannot escape the upload
        # directory.
        # NOTE(review): werkzeug.utils.secure_filename is the stricter,
        # framework-provided option if it can be imported here.
        filename = os.path.basename(uploaded.filename or "")
        if filename in ("", ".", ".."):
            # No usable name (e.g. an empty form field): nothing to save.
            continue

        destination = os.path.join(target, filename)
        uploaded.save(destination)

        fasta_path = destination
        fasta = pp.read_fasta(fasta_path)

    return render_template("algorithmSelectionPage.html")
Esempio n. 10
0
def read_in_seqs(in_file):
    """Return whatever ``ph.read_fasta`` produces for the file *in_file*."""
    return ph.read_fasta(in_file)
Esempio n. 11
0
def run():
    """Compute restriction-digest features for the uploaded FASTA file.

    Reads the FASTA file at the module-level ``fasta_path`` and the enzyme
    table ``<APP_ROOT>/files/RestrictionEnzymes.csv``.  Only the first 84
    table rows are used, and only recognition sequences without any of the
    letters S, Y, R, N, V, W, B, D, H, K, M.  Each sequence is upper-cased
    and split at every remaining recognition sequence, giving per enzyme:

    * the number of fragments (``len(seq.split(pattern))``),
    * the root-mean-square fragment length,
    * the mean fragment mass from the per-base table below.

    The feature lists are rebuilt for every sequence, so the values printed
    and passed to the template belong to the last sequence in the file.

    Returns:
        The rendered ``complete.html`` template with ``f1`` set to the list
        of mean fragment masses.
    """
    fasta = pp.read_fasta(fasta_path)
    target = os.path.join(APP_ROOT, "files")

    enzymes = pd.read_csv(os.path.join(target, "RestrictionEnzymes.csv"))
    enzymes = enzymes[0:84]

    # Keep only enzymes whose recognition sequence has none of these letters.
    excluded_letters = set("SYRNVWBDHKM")
    keep = [excluded_letters.isdisjoint(pattern)
            for pattern in enzymes['Recognition Sequence']]
    usable = enzymes.loc[keep]
    # A boolean mask replaces the row-by-row DataFrame.append, which was
    # removed in pandas 2.0, and keeps names aligned 1:1 with patterns.
    enzyme_patterns = list(usable['Recognition Sequence'])
    names_of_enzymes = list(usable['Enzymes'])

    # Column labels, in the same order as the features are laid out below:
    # all fragment counts, then all RMS values, then all mean masses.
    listDescription = [name + descriptor
                       for descriptor in ("/numCuts", "/RMS", "/avgMW")
                       for name in names_of_enzymes]

    # Per-base mass contributions, values taken unchanged from the original.
    base_mass = {'A': 507, 'C': 483, 'G': 523, 'T': 483}

    # Defined up front so an empty FASTA file no longer ends in a NameError.
    num_of_cuts, listOfRMS, listAvgMoMa = [], [], []

    for seq in fasta['sequence']:
        seq = seq.upper()
        num_of_cuts, listOfRMS, listAvgMoMa = [], [], []

        for pattern in enzyme_patterns:
            fragments = seq.split(pattern)
            # Lengths and masses are computed per enzyme and per fragment.
            # The original never reset its accumulators and appended
            # num_of_cuts where the mean mass was intended.
            lengths = [len(fragment) for fragment in fragments]
            masses = [sum(base_mass.get(base, 0) for base in fragment)
                      for fragment in fragments]

            # NOTE(review): this is the fragment count, i.e. one more than
            # the number of cut sites, although the label says "numCuts".
            num_of_cuts.append(len(fragments))
            listOfRMS.append(sqrt(mean(square(lengths))))
            listAvgMoMa.append(mean(masses))

    print(len(num_of_cuts), len(listOfRMS), len(listAvgMoMa))

    feature_row = num_of_cuts + listOfRMS + listAvgMoMa
    if feature_row:
        inputfeature = pd.DataFrame([feature_row], columns=listDescription)
    else:
        # No sequences or no usable enzymes: nothing to label.
        inputfeature = pd.DataFrame([feature_row])

    print(type(inputfeature))
    print(inputfeature.shape)
    # head() must be called; the original printed the bound method.
    print(inputfeature.head())

    return render_template("complete.html", f1=listAvgMoMa)
Esempio n. 12
0
def run(df,cutoff=0.9,keep_tmp=False):
    """
    Run cd-hit on a phylopandas data frame to remove similar sequences.
    Sequences are sorted from longest to shortest before running cd-hit,
    ensuring that the larger sequences always end up in the final output.

    df: phylopandas data frame to de-duplicate (needs "uid" and "sequence"
        columns).
    cutoff: percent sequence identity cutoff (between 0 and 1.0) at which to
            place two sequences in the same cluster
    keep_tmp: keep temporary cd-hit files.

    Returns a copy of the rows of df whose uid is among the ids in the
    cd-hit output file.

    Raises ValueError if cutoff is outside [0, 1.0] and RuntimeError if
    cd-hit cannot be started or exits with a non-zero status.  After a
    non-zero exit the temporary files are left in place for inspection.
    """

    # Sanity check, convert to string for later
    if cutoff < 0 or cutoff > 1.0:
        err = "cutoff must be a float between 0 and 1.0 (inclusive)\n"
        raise ValueError(err)
    cutoff = "{:.3f}".format(cutoff)

    # Work on a copy so the caller's frame is not modified
    tmp_df = df.copy()

    # Sort the data frame so the longest sequence occurs first
    tmp_df["seq_length"] = [len(s) for s in tmp_df.sequence]
    tmp_df = tmp_df.sort_values(by=["seq_length"],ascending=False)

    # make a 10-character random string for temporary files
    tmp_file_root = "".join([random.choice(string.ascii_letters) for i in range(10)])
    input_file = "{}_cd-hit_in.fasta".format(tmp_file_root)
    out_root = "{}_cd-hit_out.fasta".format(tmp_file_root)

    # Write data frame to fasta file
    phy.seqio.write.to_fasta(tmp_df,id_col="uid",filename=input_file)

    # Construct cd-hit command
    cmd = ['cd-hit', "-i", input_file, "-o", out_root,"-c",cutoff]

    # Run cd-hit.  stderr is folded into stdout, so one stream holds all
    # of the program's output.
    try:
        process = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        stdoutdata, _ = process.communicate()
    except FileNotFoundError as exc:
        # Nothing ran, so the input file has no diagnostic value.
        if not keep_tmp and os.path.exists(input_file):
            os.remove(input_file)
        err = "cd-hit does not appear to be in your path\n"
        raise RuntimeError(err) from exc

    # Make sure it returned successfully
    if process.returncode != 0:
        err = "cd-hit failed\n"
        # Include what cd-hit printed; it was previously captured and
        # then thrown away, leaving failures undiagnosable.
        if stdoutdata:
            err += "\n\n------output-----\n\n"
            err += stdoutdata.decode(errors="replace")
        raise RuntimeError(err)

    # Parse cd-hit output
    new_seqs = phy.read_fasta("{}".format(out_root))

    # Remove temporary files
    if not keep_tmp:
        os.remove(out_root)
        os.remove("{}.clstr".format(out_root))
        os.remove(input_file)

    # Only grab sequences with uid that made it through cdhit
    return df[df.uid.isin(list(new_seqs.id))].copy()
    def open_files(self, fasta_path, genome_path):
        """Load the genome FASTA and the gene coordinate table onto ``self``.

        ``self.genome_fasta`` is the frame read from *fasta_path* with the
        'label', 'uid' and 'description' columns dropped.
        ``self.genes_coordinate`` is the header-less, tab-separated table at
        *genome_path* ('#' starts a comment), with its first nine
        positional columns given descriptive ``*_gene`` names.
        """
        discarded_columns = ['label', 'uid', 'description']
        self.genome_fasta = ph.read_fasta(fasta_path)
        self.genome_fasta.drop(columns=discarded_columns, inplace=True)

        # Names for positional columns 0..8, in order.
        gene_columns = ('seqId_gene', 'source_gene', 'type_gene',
                        'start_gene', 'end_gene', 'score_gene',
                        'strand_gene', 'phase_gene', 'attributes_gene')
        self.genes_coordinate = pd.read_csv(genome_path, sep='\t',header=None, comment='#')
        self.genes_coordinate = self.genes_coordinate.rename(
            columns=dict(enumerate(gene_columns)))
Esempio n. 14
0
import phylopandas as ph
import string
import pandas as pd
# Write up to 2000 records of the first FASTA file whose accession (the
# second '|'-separated field of the id) is not an id in the second file.
df1 = ph.read_fasta('uniprot-membrane .fasta')
df2 = ph.read_fasta('Uniq_MP.fasta')
#print(df1.columns.values.tolist())   # column names
#print(df1.shape[0])
#print(df2.columns.values.tolist())   # column names
#print(df2.shape[0])

# A set gives constant-time membership tests instead of rescanning df2 for
# every record of df1.
known_ids = set(df2['id'])

count = 1
# The context manager guarantees the output is flushed and closed.
with open('01_2000pro.fasta', 'w') as f:
    for full_id, sequence in zip(df1['id'], df1['sequence']):
        accession = full_id.split('|')[1]
        if accession in known_ids:
            continue
        # No match in the second file: write the record.
        f.write('>' + accession + '\n')
        f.write(sequence + '\n')
        count = count + 1
        if (count > 2000):
            break
Esempio n. 15
0
def run(df,
        sequence_type="protein",
        min_conserved=None,
        min_flank=None,
        max_contig_noncon=8,
        min_block_length=10,
        allowed_gap="no",
        use_sim_matrix=True,
        min_initial_block=None,
        keep_tmp=False):
    """

    Run Gblocks on a phylopandas dataframe.

    Full docs for software are here:
    http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html#Installation

    As of this writing (August 2019), Gblocks can be installed using
    bioconda (https://bioconda.github.io/).

    df: data frame with "uid" and "sequence" columns holding the alignment.
    sequence_type: type of sequence
                   default: protein
                   allowed: ("protein","dna","codon")
                   gblock flag: -t
    min_conserved: minimum number of sequences for conserved position
                   default: num_seqs*0.5 + 1
                   allowed: (num_seqs*0.5 + 1,num_seqs)
                   gblock flag: -b1
    min_flank: minimum number of sequences for a flank position
                   default: 0.85*num_seqs
                   allowed: (min_conserved,num_seqs)
                   gblock flag: -b2
    max_contig_noncon: maximum number of contiguous non-conserved positions
                   default: 8
                   allowed: any integer > 0
                   gblock flag: -b3
    min_block_length: minimum length of a block
                   default: 10
                   allowed: any integer > 2
                   gblock flag: -b4
    allowed_gap: allowed gap positions
                   default: "no"
                   allowed: ("no","half","all")
                   gblock flag: -b5
    use_sim_matrix: use similarity matrix
                   default: True
                   allowed: True,False (True only allowed for proteins)
                   gblock flag: -b6
    min_initial_block: minimum length of an initial block
                   default: min_block_length
                   allowed: any integer > 2
                   gblock flag: -b0
    keep_tmp: keep temporary files and write out reports
                   default: False
                   allowed: False,True

    Returns the columns of df (minus the old "sequence") left-merged on
    "uid" with the sequences read from the Gblocks output file.

    Raises RuntimeError if Gblocks cannot be started, exits with a status
    other than 1, removes every alignment column, or if a row has no
    sequence after the merge.  Argument validation is delegated to the
    _qual_arg/_quant_arg helpers (presumably raising on bad values; their
    behaviour is not visible here).
    """

    tmp_file_root = "".join(
        [random.choice(string.ascii_letters) for i in range(5)])

    input_file = "{}.fasta".format(tmp_file_root)
    output_file = "{}.fasta-gb".format(tmp_file_root)
    output_summary = "{}.fasta-gb.htm".format(tmp_file_root)
    gblocks_stdout = "{}.stdout".format(tmp_file_root)
    gblocks_stderr = "{}.stderr".format(tmp_file_root)

    phy.seqio.write.to_fasta(df, id_col="uid", filename=input_file)

    cmd = ["Gblocks", input_file]

    num_seqs = len(df.uid)

    # Parse sequence type
    a = _qual_arg(sequence_type, "sequence_type", "t", {
        "protein": "p",
        "dna": "d",
        "codon": "c"
    })
    cmd.append(a)

    # Parse min conserved
    min_allowed = int(math.ceil(0.5 * num_seqs) + 1)
    max_allowed = num_seqs
    if min_conserved is None:
        min_conserved = min_allowed
    a = _quant_arg(min_conserved, "min_conserved", "b1", min_allowed,
                   max_allowed)
    cmd.append(a)

    # Parse min flank
    # NOTE(review): the docstring gives num_seqs as the upper limit, but no
    # upper limit is enforced here (sys.maxsize) -- confirm which is meant.
    min_allowed = min_conserved
    max_allowed = sys.maxsize
    if min_flank is None:
        min_flank = int(math.ceil(0.85 * num_seqs))
    a = _quant_arg(min_flank, "min_flank", "b2", min_allowed, max_allowed)
    cmd.append(a)

    # Parse max_contig_noncon
    a = _quant_arg(max_contig_noncon, "max_contig_noncon", "b3", 1,
                   sys.maxsize)
    cmd.append(a)

    # Parse min_block_length
    a = _quant_arg(min_block_length, "min_block_length", "b4", 2, sys.maxsize)
    cmd.append(a)

    # Parse allowed_gap
    a = _qual_arg(allowed_gap, "allowed_gap", "b5", {
        "no": "n",
        "half": "h",
        "all": "a"
    })
    cmd.append(a)

    # parse use_sim_matrix; it is silently switched off for non-protein
    # input
    use_sim_matrix = bool(use_sim_matrix)
    if use_sim_matrix and sequence_type != "protein":
        use_sim_matrix = False
    a = _qual_arg(use_sim_matrix, "use_sim_matrix", "b6", {
        True: "y",
        False: "n"
    })
    cmd.append(a)

    # parse min_initial_block
    if min_initial_block is None:
        min_initial_block = min_block_length
    a = _quant_arg(min_initial_block, "min_initial_block", "b0", 2,
                   sys.maxsize)
    cmd.append(a)

    if keep_tmp:
        cmd.append("-d=y")
    else:
        cmd.append("-d=n")

    # Run gblocks.  The context manager closes both log files even when the
    # executable is missing (they used to stay open in that case).
    try:
        with open(gblocks_stdout, "w") as stdout, \
                open(gblocks_stderr, "w") as stderr:
            result = subprocess.run(cmd, stdout=stdout, stderr=stderr)
    except FileNotFoundError as exc:
        # Nothing ran, so the files written so far carry no information.
        if not keep_tmp:
            for leftover in (input_file, gblocks_stdout, gblocks_stderr):
                if os.path.exists(leftover):
                    os.remove(leftover)
        err = "Gblocks does not appear to be in your path\n"
        raise RuntimeError(err) from exc

    # As far as I can tell, Gblocks *always* returns 1.  At least on my
    # MacBook (Mojave 10.14.6, bioconda build.)
    if result.returncode != 1:

        err = "Gblocks failed for some reason\n"

        err += "\n\n------stdout-----\n\n"
        with open(gblocks_stdout) as handle:
            err += handle.read()
        err += "\n\n------stderr-----\n\n"
        with open(gblocks_stderr) as handle:
            err += handle.read()

        raise RuntimeError(err)

    # Parse output
    new_seqs = phy.read_fasta(output_file)
    new_seqs = pd.DataFrame({
        "uid": new_seqs.id,
        "sequence": new_seqs.sequence
    })

    # Positional access: df may carry an index without the label 0 (for
    # example after filtering), which made .loc[0] raise KeyError.
    num_old_columns = len(df.sequence.iloc[0])
    num_new_columns = len(new_seqs.sequence.iloc[0])
    if num_new_columns == 0:
        err = "\n\nGblocks removed all columns from alignment.\n"
        err += "Trying building a better alignment, editing it manually,\n"
        err += "or altering one or more Gblocks settings to be less aggressive.\n"
        err += "See help string on this function for details.\n\n"

        raise RuntimeError(err)

    print("Gblocks reduced the number of columns from {} to {}.\n".format(
        num_old_columns, num_new_columns))

    # Drop the sequence information from the original frame
    to_merge = df.copy()
    to_merge = to_merge.drop(labels=["sequence"], axis=1)

    # Merge sequence information back in, this time trimmed by Gblocks.
    output = pd.merge(to_merge, new_seqs, on=['uid'], how='left')

    # Make sure no na got introduced
    if np.sum(pd.isnull(output.sequence)) > 0:
        err = "an unknown error caused sequences to be lost during alignment.\n"
        err += "temporary files {} and {} saved.\n".format(
            input_file, output_file)
        raise RuntimeError(err)

    # Remove temporary files
    if not keep_tmp:
        os.remove(input_file)
        os.remove(output_file)
        os.remove(output_summary)
        os.remove(gblocks_stdout)
        os.remove(gblocks_stderr)

    return output
Esempio n. 16
0
import phylopandas as ph
import string
from Bio import SeqIO
import pandas as pd
import numpy as np
# Write 271 randomly chosen records of the input file as negative samples.
df1 = ph.read_fasta('1568122732.fas.db2novel')
print(df1.shape[0])

# Randomly pick 271 distinct row positions (raises ValueError if the file
# holds fewer than 271 records).
sampled_positions = np.random.choice(range(len(df1)), 271, replace=False)

# The context manager guarantees the output is flushed and closed.
with open('Negative_Samples.fasta', 'w') as f:
    for position in sampled_positions:
        # The samples are positions, so look rows up positionally.
        record = df1.iloc[position]
        f.write('>' + record['id'] + '\n')
        f.write(record['sequence'] + '\n')