Example #1
0
#converts FASTQ files from NCBI to FASTA files
#can take a file name or list of file names as an input
#Last updated 2015.8.10

import sys            #for helping with command line interface
import dante          #main dante functions
import traceback      #for helping with errors


try: 

    #Converts each input file from FASTQ to FASTA.
    #dante.makeFileList supplies the input file names and dante.makeNewFileName
    #builds the matching ".fasta" output name.
    for file_name in dante.makeFileList(sys.argv):
        file_name_new = dante.makeNewFileName("Desktop/Output", file_name, ".fasta")
        dante.printOutput(file_name_new)
        with open(file_name, 'r') as f, open(file_name_new, 'w') as g:

            #goes through each line in f
            #trigger becomes True once an "@" header line has been seen
            #NOTE(review): nothing in the lines shown here sets trigger back to
            #False, so every non-blank line after the first header is written
            #-- confirm this is intended.
            trigger = False
            for line in f:
                print(line)
                if not line.isspace():
                    #write the line if a header line occurred previously
                    #(the original wrote to the undefined name "newFile";
                    #the open output handle is g)
                    if trigger:
                        g.write(line)
                    #if "@" header line, write to file
                    if line[0] == "@":
                        #change header information to fasta format
                        #(everything but the "@" sign is kept, prefixed with ">")
                        g.write(">" + line[1:])
                        #set up so that the next line will automatically be written
                        trigger = True
Example #2
0
try:

    #Longest name length allowed, kept exactly as the text the user typed.
    max_name_length = raw_input('What is the maximum name length allowed?:')

    #sys.argv[0] is the script name, so two file arguments mean len(sys.argv) > 2.
    #With fewer arguments, ask for both paths interactively.
    if len(sys.argv) <= 2:
        tre_file = raw_input('Enter the tree file to be changed: ')
        names_file = raw_input('Enter the names file: ')
    else:
        file_names = sys.argv[1:]
        tre_file, names_file = sys.argv[1], sys.argv[2]

    #Dictionary state used while reading the names file.
    names_dict = dict()  #Key = short name. Value = long name.
    current_key = ""

    #Work out the output file name and report it.
    file_name_new = dante.makeNewFileName("Desktop/Output", tre_file, ".namesfixed.tre")
    dante.printOutput(file_name_new)

    #Goes through every line of the names file.
    #Create a list of short names as well as a dictionary using the short name as the key and the full name as the value.
    #key_switch starts True, so the first line read is taken as a key.
    with open (names_file, 'r') as f:
        key_switch = True
        for line in f:
            if key_switch:
                #This line is a key: clear the switch so the next line takes the other branch.
                key_switch = False
                current_key = line.strip() #Creates a key to use in a dictionary. Removes newline from key.
            else:
Example #3
0
#FASTA to PHYLIP + NAMES
#This program simplifies names to 10 characters and creates a cross reference file.
#It then returns a PHYLIP sequential formatted sequence file.  
#This program allows multiple file names to be called as arguments.

#Last updated: 2015.8.12

import sys            #for helping with command line interface
import dante          #main dante functions
import traceback      #for helping with errors

try:
    
    #Processes every file name returned by dante.makeFileList.
    for file_name in dante.makeFileList(sys.argv):
        #file_clean is the value returned by dante.fastaClean; it is the file opened below
        #and the base for both output names (".phylips" and ".dnames").
        file_clean = dante.fastaClean(file_name)
        file_name_phylip = dante.makeNewFileName ("Desktop/Output", file_clean, ".phylips")
        file_name_names = dante.makeNewFileName ("Desktop/Output", file_clean, ".dnames")


        with open(file_clean,'r') as f: 
            
            #NOTE(review): the reminder below mentions a reference file and a header list,
            #neither of which appears in the lines shown here -- confirm it is still relevant.
            #NEED TO GO THROUGH ALL THE LINES OF THE REFERENCE FILE AND WRITE THE HEADER LINE TO A NEW LIST
            
            #create phylip header (count number of sequences and the number of characters in first sequence)
            sequence_number = 0
            character_number = 0
            
            for line in f:
                #each line starting with ">" is a FASTA header and counts as one sequence
                if line [0] == ">":
                    sequence_number = sequence_number + 1
                else:
Example #4
0
    #Get search information from user.
    #Each question is logged together with its answer.
    rep_number = dante.getNumber("How many representatives do you want?")
    dante.log("How many representatives do you want?")
    dante.log(rep_number)
    database = str(raw_input("Which database do you want to search?"))
    dante.log("Which database do you want to search?")
    dante.log(database)  #the answer was previously missing from the log
    blast_db_format = 5  #exports hits as XML
    dante.log("Database format")
    dante.log(blast_db_format)

    #allows multiple inputs at command line or will ask for an input file
    #gets a list of fasta files
    for filename in dante.makeFileList(sys.argv):
        dante.log("Input Filename")
        dante.log(filename)

        #The XML name records the database and the number of hits requested.
        xml_suffix = ".BLAST." + database + ".top" + str(rep_number) + "hits.xml"
        file_name_xml = dante.makeNewFileName("Desktop/Output", filename, xml_suffix)
        dante.log("Filename of XML file")
        dante.log(file_name_xml)

        #The summary name is derived from the XML name.
        file_name_summary = dante.makeNewFileName("Desktop/Output", file_name_xml, ".summary.tsv")
        dante.log("Filename of Summary file")
        dante.log(file_name_summary)

        #Build the blastn command; the results are written to file_name_xml.
        blastn_cline = NcbiblastnCommandline(remote=True, query=filename, db=database, outfmt=blast_db_format, out=file_name_xml, max_target_seqs=rep_number)

        dante.log(blastn_cline)

        #Calling the command object runs blastn and returns its stdout and stderr.
        stdout, stderr = blastn_cline()

        dante.BLASTSummary(file_name_xml, file_name_summary)


        print("_____Summary_____")
Example #5
0
#REQUIRES BIOPYTHON TO BE INSTALLED
#REQUIRES DANTE MODULE

import sys            #for helping with command line interface
import dante          #main dante functions
import traceback      #for helping with errors

#Summarizes every input file with dante.BLASTSummary and reports the output names.
#On an error the traceback is printed and logged, and the script exits with
#status 1 so callers can tell the run failed.
try:
    dante.log("Running BLAST_Summary.py")

    #allows multiple inputs at command line or will ask for an input file
    #gets a list of fasta files
    for filename in dante.makeFileList(sys.argv):
        file_name_new = dante.makeNewFileName("Desktop/Output", filename, ".summary.tsv")
        dante.log("Original File Name: " + filename)
        dante.log("New File Name: " + file_name_new)
        dante.BLASTSummary(filename, file_name_new)
        #Single-argument print calls give the same text under Python 2 and 3
        #(the doubled space matches what the print statement produced).
        print("_____Summary_____")
        print("For filename:  " + filename)
        print("Output file:  " + file_name_new)

#Exception (not a bare except) lets Ctrl-C and SystemExit pass through untouched.
except Exception:
    traceback.print_exc(file=sys.stdout)
    dante.log(traceback.format_exc())
    sys.exit(1)  #non-zero status: the run did not succeed
Example #6
0
    print ("NCBI databse requires an e-mail address. ")
    Entrez.email = dante.getEmail()
    #allows multiple inputs at command line or will ask for an input file
    for filename in dante.makeFileList(sys.argv):

        name_set = set()
        name_list=[]
        
        result_handle = open(filename,'r') #open the xml file for reading
        blast_records = NCBIXML.parse(result_handle) #parses the file to a blast_records object
        total = 0
        for record in blast_records: #go through every record generated
            for alignment in record.alignments:
                name_set.add(alignment.title.split('|')[1]) #adds id number to set (removes duplicates)

        new_file = dante.makeNewFileName ('Desktop/Output', filename, "seqs.fasta")
        
        with open(new_file,'w') as f:
            for value in name_set: #walks through every id number
                #Biopython for retreving fasta files
                handle = Entrez.efetch(db="nucleotide", id=value, rettype="fasta", retmode="text")
                f.write(handle.read())

        dante.log("Program Ran: BlastXMLtoFasta.py")
        dante.log("Input file: " + filename)
        dante.log("Output file:" + new_file)

except:
    traceback.print_exc(file=sys.stdout)
    exit(0)
Example #7
0
        second_file_name = sys.argv[2]
    else:
        first_file_name = str(raw_input('Enter the file to be changed: '))
        second_file_name = str(raw_input('Enter the reference file that contains the correct header names: '))

    
    
    #Read first_file_name and map the second '|'-separated field of every
    #header line (a line starting with ">") to the complete header line.
    with open(first_file_name, 'r') as f:
        headerDict = {
            header.split('|')[1]: header
            for header in f
            if header[0] == ">"
        }

    #open file using user supplied name
    #Reads second_file_name and writes the result to new_file.
    #NOTE(review): the prompts call the first file "the file to be changed" and the second
    #"the reference file", yet the headers are collected from the first file and the second
    #file is the one rewritten here -- confirm the two roles are not swapped.
    new_file = dante.makeNewFileName ('Desktop/Output', second_file_name, ".namesfixed.fasta")
    with open(second_file_name,'r') as f, open(new_file,'w') as g:
        print "\n"
        print ("Output File Name: %s") %(new_file)
        print "\n"
        for line in f:
            #flag stays True while no stored header has replaced this line
            flag = True
            if line[0] == ">":  #Header lines in fasta format all start with '>'
                test_id = line.split('|')[1] #This only works on NCBI formatted files
                #Compare the id with every stored key; on a match write the stored header instead.
                #NOTE(review): a direct lookup in headerDict looks equivalent to this scan -- confirm.
                for key in headerDict:
                    if str(test_id) == str(key):  #tests id against key
                        g.write(headerDict[key])
                        flag = False
                #No stored header matched, so the header line is written unchanged.
                if flag:
                    g.write(line)