Ejemplo n.º 1
0
def Lex():
    """ Given a collection of symbols and a positive integer n, exports
    all strings of length n that can be formed from the alphabet,
    ordered lexicographically with respect to the order in which the
    symbols appear in the input.

    The first input line holds the whitespace-separated symbols, the
    second line holds n. The strings are handed to f.ExportToFile joined
    by newlines. Returns None. """
    from itertools import product

    lines = f.LoadFile('\\rosalind_lexf.txt').splitlines()
    sym = lines[0].split()
    r = int(lines[1])

    # product() walks sym in its given order at every position and varies
    # the last position fastest, which is exactly lexicographic order for
    # that alphabet. The earlier hand-rolled version ordered only the
    # first character with sorted(), so an alphabet that was not already
    # sorted produced an inconsistent ordering.
    lex_list = [''.join(chars) for chars in product(sym, repeat=r)]

    f.ExportToFile('rosalind_lexf_output.txt', '\n'.join(lex_list))
    return
Ejemplo n.º 2
0
def Founder():
    """ Exports a matrix of log10(DriftToNone(N, i, copies)) values:
    one row per generation i = 1..m and one column per entry of the
    array given on the second input line.

    Per the original description, each value is the log-probability that
    no copies of the recessive allele remain after i generations
    (this relies on what DriftToNone computes). Returns None. """
    lines = f.LoadFile('\\rosalind_foun.txt').splitlines()
    N, m = [int(tok) for tok in lines[0].split()]
    A = [int(tok) for tok in lines[1].split()]

    rows = []
    for generation in range(1, m + 1):
        # cells are kept as text so each row can be joined directly
        cells = [str(log10(DriftToNone(N, generation, copies)))
                 for copies in A]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_foun_output.txt', '\n'.join(rows))
    return
Ejemplo n.º 3
0
def OverlapGraph():
    """ Exports the adjacency list of the overlap graph of the FASTA
    entries: an edge goes from one string to a different string whenever
    the last 3 characters of the first equal the first 3 characters of
    the second. Each output line is 'source_label target_label'. """
    fasta_text = f.LoadFile('\\rosalind_grph.txt')
    Labels, DNA = f.FASTA(fasta_text)

    # label of the source string -> labels of every string it points to
    edges_by_label = {}
    for tail in dict.fromkeys(DNA):  # distinct strings, first-seen order
        suffix = tail[-3:]
        heads = [head for head in DNA
                 if head != tail and suffix == head[:3]]
        if not heads:
            continue  # strings without a successor are left out
        # Strings are mapped back to the label of their first occurrence
        edges_by_label[Labels[DNA.index(tail)]] = [
            Labels[DNA.index(head)] for head in heads
        ]

    output = [' '.join([source, target])
              for source, targets in edges_by_label.items()
              for target in targets]
    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(output))
    return
Ejemplo n.º 4
0
def NewickDistanceWeights():
    """ For every Newick tree in the input, exports the distance between
    the pair of node names listed with it (space-separated, one value
    per tree). """
    lines = f.LoadFile('\\rosalind_nkew.txt').splitlines()

    # A line containing ';' is a tree; any other non-empty line is a pair
    Trees = [line for line in lines if ';' in line]
    Pairs = [line.split() for line in lines
             if ';' not in line and line != '']

    distances = []
    for idx, newick in enumerate(Trees):
        tree = Phylo.read(io.StringIO(newick), 'newick')
        # If no edge weights are specified, every clade can be given
        # weight 1 first:
        #     for clade in tree.find_clades():
        #         clade.branch_length = 1
        first = Pairs[idx][0]
        second = Pairs[idx][1]
        distances.append(str(tree.distance(first, second)))

    f.ExportToFile('rosalind_nkew_output.txt', ' '.join(distances))
    return
Ejemplo n.º 5
0
def TT():
    """ Given 2 DNA strings of equal length (FASTA), exports the
    transition / transversion ratio.
    Transitions: A<->G, C<->T
    Transversions: A<->T, A<->C, G<->C, G<->T """
    fasta_text = f.LoadFile('\\rosalind_tran.txt')
    Labels, DNA = f.FASTA(fasta_text)
    p = DNA[0]
    q = DNA[1]

    transition = transversion = 0
    for pos, base in enumerate(p):
        other = q[pos]
        if base == other:
            continue  # identical positions are not mutations
        within_ag = base in 'AG' and other in 'AG'
        within_ct = base in 'CT' and other in 'CT'
        if within_ag or within_ct:
            transition += 1
        else:
            transversion += 1

    f.ExportToFile('rosalind_tran_output.txt',
                   str(transition / transversion))
    return
Ejemplo n.º 6
0
def CompareSpectra():
    """ Given 2 spectra (one whitespace-separated line of masses each),
    exports:
    1. the largest multiplicity among all pairwise differences
       (first spectrum minus second), rounded to 5 decimals
    2. the difference value that reaches that multiplicity

    Raises ValueError when either spectrum is empty, since no
    difference exists then. Returns None. """
    from collections import Counter

    lines = f.LoadFile('\\rosalind_conv.txt').splitlines()
    temp_spec = [float(x) for x in lines[0].split()]
    test_spec = [float(x) for x in lines[1].split()]

    # Tally every pairwise difference in a single pass. Calling
    # list.count once per distinct value is quadratic in the number of
    # differences, which itself is already len(temp) * len(test).
    tally = Counter(round(i - j, 5) for i in temp_spec for j in test_spec)
    if not tally:
        raise ValueError('both spectra must contain at least one mass')

    # most_common(1) yields the value and its count together, so no
    # second counting pass is needed
    mode, count = tally.most_common(1)[0]

    f.ExportToFile('rosalind_conv_output.txt',
                   '\n'.join([str(count), str(mode)]))
    return
Ejemplo n.º 7
0
def Sets():
    """ Exports 6 sets, one per line, each formatted as '{a, b, ...}':
    1. A U B
    2. A intersection B
    3. A - B
    4. B - A
    5. Ac  (complement within 1..n)
    6. Bc  (complement within 1..n)

    Input: n on the first line, then A and B written as '{a, b, ...}'.
    Elements are kept as strings throughout. Returns None. """
    lines = f.LoadFile('\\rosalind_seto.txt').splitlines()
    n = int(lines[0])

    def parse(text):
        # Dropping empty items makes '{}' parse to no elements instead
        # of a single empty string
        stripped = text.strip().replace('{', '').replace('}', '')
        return [item for item in stripped.split(', ') if item]

    A = parse(lines[1])
    B = parse(lines[2])

    # Hash-based lookups: list membership made every comprehension
    # below quadratic in the size of the sets
    in_A = frozenset(A)
    in_B = frozenset(B)

    AB_union = RemoveDuplicates(A + B)  # either A or B (or both)
    AB_intersect = [i for i in A if i in in_B]  # both A & B
    AB_diff = [i for i in A if i not in in_B]  # A not B
    BA_diff = [i for i in B if i not in in_A]  # B not A

    U = [str(i) for i in range(1, n + 1)]  # universe for complements
    A_comp = [i for i in U if i not in in_A]  # U not A
    B_comp = [i for i in U if i not in in_B]  # U not B

    # Return in format
    results = [AB_union, AB_intersect, AB_diff, BA_diff, A_comp, B_comp]
    output = ['{%s}\n' % ', '.join(members) for members in results]
    f.ExportToFile('rosalind_seto_output.txt', ''.join(output))
    return
Ejemplo n.º 8
0
def MatchSpectrum():
    """ Given:
    1) A positive integer n
    2) n protein strings
    3) A multiset corresponding to the complete spectrum of some
    unknown protein string...
    ... exports the maximum multiplicity, and the string where this
    occurs (one per line). Returns None. """
    lines = f.LoadFile('\\rosalind_prsm.txt').splitlines()
    n = int(lines[0])
    proteins = lines[1:n + 1]
    # The spectrum starts directly after the n protein lines; starting
    # one line later silently dropped its first mass. Blank lines are
    # skipped so an optional separator line is still tolerated.
    spectrum = [float(x) for x in lines[n + 1:] if x.strip()]

    # Find the masses for each protein
    masses = [GetMasses(prot) for prot in proteins]

    # Find mode for each
    # NOTE(review): assumes CompareSpectra(masses, spectrum) returns the
    # multiplicity as a comparable number - confirm against the helper
    modes = [CompareSpectra(m, spectrum) for m in masses]

    # Protein with the largest multiplicity (first one on ties)
    max_mode = max(modes)
    max_protein = proteins[modes.index(max_mode)]

    # Export the multiplicity itself, not the position of the protein
    f.ExportToFile('rosalind_prsm_output.txt',
                   '\n'.join([str(max_mode), max_protein]))
    return
Ejemplo n.º 9
0
def CyclicWithRC():
    """ Assembles a cyclic superstring covering every read OR its
    reverse complement, and exports it with the wrap-around overlap
    removed.

    Nothing is exported when the merged string has no prefix that
    equals its suffix. """
    reads = f.LoadFile('\\rosalind_gasm.txt').splitlines()
    Graph = p.DeBruijnRC(reads)

    # Walk the graph from a random start node, visiting half as many
    # nodes as the graph holds
    cycle = []
    current = random.choice(list(Graph.keys()))
    steps = len(Graph) // 2
    while len(cycle) < steps:
        cycle.append(current)
        successor = Graph[current][0]
        if successor in Graph:
            current = successor
        else:
            # successor is not a node itself: continue from the node
            # MaxOverlap picks for it
            current = MaxOverlap(successor, Graph)

    # Merge the visited nodes into one string based on overlap
    superstring = cycle[0]
    for piece in cycle[1:]:
        superstring = p.Combine(superstring, piece)

    # Longest prefix that is also a suffix is the cyclic overlap; cut it
    total = len(superstring)
    overlap = total - 1
    while overlap > 0:
        if superstring[:overlap] == superstring[total - overlap:]:
            f.ExportToFile('rosalind_gasm_output.txt',
                           superstring[:total - overlap])
            return
        overlap -= 1
Ejemplo n.º 10
0
def DeBruijnRC():
    """ Exports the adjacency list built from the given DNA strings and
    their reverse complements: every distinct string contributes one
    edge from its prefix (all but the last character) to its suffix
    (all but the first character), written as '(prefix, suffix)'.
    Returns None. """
    S = f.LoadFile('\\rosalind_dbru.txt').splitlines()

    # Make list of S U Src (reverse complements first, as before)
    SuRC = [ReverseComplement(read) for read in S]
    SuRC.extend(S)
    SuRC = RemoveDuplicates(SuRC)

    # One pass groups the suffixes under their prefix. Keys appear in
    # first-seen order and values in SuRC order, which is the same
    # result the previous prefix-by-prefix rescan of SuRC produced,
    # without the quadratic cost.
    adj_dict = {}
    for kmer in SuRC:
        adj_dict.setdefault(kmer[:-1], []).append(kmer[1:])

    # Return in format
    output = ['(%s, %s)' % (prefix, suffix)
              for prefix, suffixes in adj_dict.items()
              for suffix in suffixes]

    f.ExportToFile('rosalind_dbru_output.txt', '\n'.join(output))
    return
Ejemplo n.º 11
0
def Subsets():
    """ Given a positive int n, exports the total number of subsets of
    {1..n} (that is 2**n) modulo 1000000. """
    raw = f.LoadFile('\\rosalind_sset.txt')
    subset_count = 2 ** int(raw)
    f.ExportToFile('rosalind_sset_output.txt',
                   str(subset_count % 1000000))
    return
Ejemplo n.º 12
0
def DistanceMatrix():
    """ Given n DNA strings (FASTA), exports their distance matrix:
    entry (i, j) is HammingDistance of strings i and j divided by the
    length of the first string. Rows are space-separated lines. """
    fasta_text = f.LoadFile('\\rosalind_pdst.txt')
    Label, DNA = f.FASTA(fasta_text)

    rows = []
    for first in DNA:
        # entries are stored as text so the row can be joined directly
        cells = [str(HammingDistance(first, second) / len(DNA[0]))
                 for second in DNA]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_pdst_output.txt', '\n'.join(rows))
    return
Ejemplo n.º 13
0
def ErrorCorrection():
    """ Given a list of DNA reads (FASTA) in which correct reads occur
    at least twice, exports every incorrect read together with its
    correction as 'read->correction', one per line. """
    fasta_text = f.LoadFile('\\rosalind_corr.txt')
    Labels, DNA = f.FASTA(fasta_text)

    # A read counts as correct when Freq reports it more than once
    # (per the original note, possibly as its reverse complement)
    repeated = [read for read in DNA if Freq(read, DNA) > 1]

    # The reverse complement of a correct read is correct as well
    with_rc = []
    for read in repeated:
        with_rc.extend((read, ReverseComplement(read)))
    correct_DNA = RemoveDuplicates(with_rc)

    # Every remaining read is paired with the correct read that
    # MinimumDistance selects for it
    output = ['%s->%s' % (read, MinimumDistance(read, correct_DNA))
              for read in DNA
              if read not in correct_DNA]
    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(output))
    return
Ejemplo n.º 14
0
def download_case_by_upload_date(year, base_folder):
    """ Downloads every case listed in the CSV case lists of one year.

    For each CSV file in '<base_folder>/<year>/caselists/date' and each
    row in it, the case text is fetched with download_case(doc_id) and
    written to '<base_folder>/<year>/Cases/<court>/<name>_<doc_id>.txt',
    unless that file already exists. The function sleeps 2 seconds
    after every download attempt.

    year: year folder name, as a string (it is concatenated into paths)
    base_folder: root folder holding the per-year folders
    Returns None. """
    caselists_folder = base_folder + '\\' + year + '\\caselists\\date'
    for case_list_csv in FileOperations.MyFolder(
            caselists_folder).get_file_list():
        csv_file = FileOperations.MyCsvFile(caselists_folder + '\\' +
                                            case_list_csv)
        print(csv_file.name)
        case_list = csv_file.read_dict()
        for i in range(len(case_list['name'])):
            cases_region_folder = (base_folder + '\\' + year +
                                   '\\Cases\\' + case_list['court'][i])
            FileOperations.MyFolder(cases_region_folder).create()
            case_name = case_list['name'][i]
            doc_id = case_list['doc_id'][i]
            # '?' and '#' are removed from the case name because they
            # are not valid in Windows file names
            case_file = FileOperations.MyTextFile(
                cases_region_folder + '\\' +
                re.sub('[?#]', '', case_name) + '_' + doc_id + '.txt')
            if case_file.exists():
                continue  # already downloaded

            print("Trying to download case {} in {}: {}...".format(
                i, case_list_csv, case_name))
            t = download_case(doc_id)
            print("Sleep 2s ...")
            time.sleep(2)
            if t:
                case_file.write(t)
                print("Case {} in {}: {} is downloaded.".format(
                    i, case_list_csv, case_name))
            else:
                # The message used to reference the undefined name
                # 'file', which raised NameError instead of reporting
                # the failed download
                print(
                    "Case {} in {}: {} download failed. The content is empty."
                    .format(i, case_list_csv, case_name))
    return None
Ejemplo n.º 15
0
def UnrootedTree():
    """ Given a positive integer n, exports the number of internal
    nodes of any unrooted binary tree having n leaves.
    n leaves --> n-2 internal nodes"""
    leaves = int(f.LoadFile('\\rosalind_inod.txt'))
    internal_nodes = leaves - 2
    f.ExportToFile('rosalind_inod_output.txt', str(internal_nodes))
    return
Ejemplo n.º 16
0
def Drift():
    """ Predicts the probability that in a population of N diploid
    individuals initially possessing m copies of a dominant allele,
    we will observe after g generations at least k copies
    of a recessive allele (assuming the Wright-Fisher model).

    Input: the four whitespace-separated integers N, m, g, k.
    The probability is exported as text. Returns None. """
    tokens = f.LoadFile('\\rosalind_wfmd.txt').split()
    N = int(tokens[0]) * 2  # diploid: 2N allele copies in total
    m = int(tokens[1])  # initial num of copies of dom allele in pop
    g = int(tokens[2])  # after g generations...
    k = int(tokens[3])  # prob that at least k copies of recessive

    # curr_gen[c] = probability of exactly c dominant copies.
    # Generation 0 is known with certainty to hold m copies.
    curr_gen = [0] * (N + 1)
    curr_gen[m] = 1

    if g > 0:
        # The Markov transition probabilities do not depend on the
        # generation, so they are computed once rather than per step:
        # transition[i][j] = P(i copies next | j copies now)
        transition = [[
            nCr(N, i) * (j / N)**i * (1 - (j / N))**(N - i)
            for j in range(N + 1)
        ] for i in range(N + 1)]

    # iterate over generations
    for _ in range(g):
        next_gen = [0] * (N + 1)
        for i in range(N + 1):  # ending point
            for j in range(N + 1):  # starting point
                next_gen[i] += transition[i][j] * curr_gen[j]
        curr_gen = next_gen  # update as current generation

    # 'At least k recessive' means at most N - k dominant copies, i.e.
    # indices 0..N-k. An explicit end index keeps k == 0 correct (the
    # former [:-k] slice was empty for k == 0); max() keeps the slice
    # empty when k exceeds the number of copies.
    prob = str(sum(curr_gen[:max(0, N - k + 1)]))
    f.ExportToFile('rosalind_wfmd_output.txt', prob)
    return
Ejemplo n.º 17
0
 def bulk_button_released(self, event):
     """ Runs bulk processing when the event position is over the
     bulk button.

     The template and image path come from the image viewer; when both
     are available, GetText.process_bulk is run on them and the
     resulting tags and values are passed to FileOperations.write_bulk.

     event: event object providing x_root / y_root screen coordinates
     """
     if self.winfo_containing(event.x_root,
                              event.y_root) == self.bulk_button:
         template, image_path = (
             self.master.image_viewer.get_bulk_process_data())
         print('template: {}, image_path: {}'.format(template, image_path))
         # Both values are checked separately: '(a and b) is not None'
         # only inspects whichever operand 'and' happens to yield, so a
         # falsy-but-not-None template let a missing image path through
         if template is not None and image_path is not None:
             tags, values = GetText.process_bulk(template, image_path)
             FileOperations.write_bulk(tags, values)
Ejemplo n.º 18
0
def ExpectedVal():
    """ Given a positive int n and an array P of probabilities, each
    corresponding to an allele frequency, exports array B where
    B[i] = n * P[i] rounded to 4 decimals (the expected value of the
    next generation, per the original description). """
    lines = f.LoadFile('\\rosalind_ebin.txt').splitlines()
    n = int(lines[0])
    P = [float(tok) for tok in lines[1].split()]

    B = []
    for prob in P:
        expected = round(prob * n, 4)
        B.append(str(expected))

    f.ExportToFile('rosalind_ebin_output.txt', ' '.join(B))
    return
Ejemplo n.º 19
0
def CompletingaTree():
    """ Given a positive integer n and an adjacency list
    corresponding to a graph on n nodes that contains no cycles,
    exports the minimum number of edges that can be added to
    the graph to produce a tree. """
    lines = f.LoadFile('\\rosalind_tree.txt').splitlines()
    n = int(lines[0])
    # every line after the first describes one existing edge
    edge_count = len(lines) - 1
    # a tree on n nodes has n - 1 edges
    missing = n - edge_count - 1
    f.ExportToFile('rosalind_tree_output.txt', str(missing))
    return
Ejemplo n.º 20
0
def InterleavingMotifs():
    """ Reads two strings (one per line), builds a matrix for them with
    MakeMatrixSCS and exports the string InterpretMatrixSCS derives
    from it.

    NOTE(review): judging by the helper names the result is a common
    supersequence of the two strings - confirm against the helpers. """
    p, q = f.LoadFile('\\rosalind_scsp.txt').splitlines()

    k, l = len(p), len(q)
    matrix = MakeMatrixSCS([], k, l, p, q)

    scs = InterpretMatrixSCS(matrix, k, l, p, q)
    f.ExportToFile('rosalind_scsp_output.txt', scs)
    return
Ejemplo n.º 21
0
def HammingDistance():
    """Exports the Hamming distance between the 2 strings given on the
    first two input lines (number of positions where they differ)"""
    lines = f.LoadFile('\\rosalind_hamm.txt').splitlines()
    p, q = lines[0], lines[1]
    # count the positions of p whose character differs from q's
    dist = sum(1 for pos, symbol in enumerate(p) if symbol != q[pos])
    f.ExportToFile('rosalind_hamm_output.txt', str(dist))
    return
Ejemplo n.º 22
0
def ProteinTomRNA():
    """ Exports the total number of different RNA strings from which
    the protein could have been translated, modulo 1000000"""
    # 'X' is appended so the stop codon is counted as well
    protein = f.LoadFile('\\rosalind_mrna.txt') + 'X'
    combo = 1
    for aa in protein:
        if aa not in mRNA_dict:
            continue  # characters without an entry are ignored
        combo *= mRNA_dict[aa]
    f.ExportToFile('rosalind_mrna_output.txt', str(combo % 1000000))
    return
Ejemplo n.º 23
0
def downloadServer(IP, username, password, directory, fileList,
                   serverFileList):
    """ Downloads the file currently selected in serverFileList from an
    FTP server into a local directory, then refreshes the local list.

    IP, username, password: FTP connection details (port 21 is used)
    directory: local target directory; '~' is expanded
    fileList: passed on to FileOperations.findFiles with directory
    serverFileList: widget offering get() / curselection() for the
        remote file name
    """
    selectFile = serverFileList.get(serverFileList.curselection())
    local_path = os.path.expanduser(directory) + "/" + selectFile

    server = ftplib.FTP()
    try:
        server.connect(IP, 21)
        server.login(username, password)
        # The local file is only created once the login succeeded; the
        # with-block closes it even when the transfer fails
        with open(local_path, 'wb') as localfile:
            server.retrbinary('RETR ' + selectFile, localfile.write, 1024)
    finally:
        # release the connection on every path, not just on success
        server.close()

    FileOperations.findFiles(directory, fileList)
Ejemplo n.º 24
0
 def save_template(self):
     """ Stores the current image size and rectangle coordinates as a
     named template.

     Does nothing unless an image is selected. The name comes from
     enter_template(); the template is only saved when that dialog
     reports 'OK' and a name that is not None. """
     if not self.master.is_image_selected:
         return

     event, value = enter_template()
     print(value)
     template_name = value[0]
     if event != 'OK' or template_name is None:
         return

     print("Temp name: ", template_name)
     templates = FileOperations.read_templates()
     image_size = (self.image_width, self.image_height)
     templates[template_name] = [image_size,
                                 self.master.rectangle_coordinates]
     FileOperations.write_templates(templates)
     print("Templates available: ", templates)
Ejemplo n.º 25
0
def Splicing():
    """ Given a DNA string and a collection of substrings acting as
    introns (FASTA, main string first), exports the protein string
    obtained by transcribing and translating the exons"""
    fasta_text = f.LoadFile('\\rosalind_splc.txt')
    Label, DNA = f.FASTA(fasta_text)

    exons = DNA[0]  # original string
    for intron in DNA[1:]:
        exons = exons.replace(intron, '')  # cut every occurrence out

    protein = RNAtoProtein(DNAtoRNA(exons))
    f.ExportToFile('rosalind_splc_output.txt', protein)
    return
Ejemplo n.º 26
0
def IndependentAlleles():
    """ Reads integers k and N and exports the probability of at least
    N successes in 2**k independent trials that each succeed with
    probability 0.25 (the binomial tail, summed term by term). """
    tokens = f.LoadFile('\\rosalind_lia.txt').split()
    k, N = int(tokens[0]), int(tokens[1])

    P = 2 ** k  # number of trials
    prob = 0
    i = N
    while i <= P:
        # binomial term, the formula for Mendel's 2nd Law
        prob = prob + nCr(P, i) * (0.25 ** i) * (0.75 ** (P - i))
        i += 1

    f.ExportToFile('rosalind_lia_output.txt', str(prob))
    return
Ejemplo n.º 27
0
def GlobalAlignment():
    """ Exports the value MakeMatrixGlobal computes for 2 strings
    (FASTA) - the maximum alignment score, per the original
    description. """
    fasta_text = f.LoadFile('\\rosalind_glob.txt')
    Labels, (p, q) = f.FASTA(fasta_text)

    # the helper receives an empty matrix plus both lengths and strings
    maxalign = MakeMatrixGlobal([], len(p), len(q), p, q)
    f.ExportToFile('rosalind_glob_output.txt', str(maxalign))
    return
Ejemplo n.º 28
0
def Splicing():
    """ Exports the sum of combinations C(n,k) for m<=k<=n,
    modulo 1000000 """
    first, second = f.LoadFile('\\rosalind_aspc.txt').split()
    n, m = int(first), int(second)

    total = 0
    for size in range(m, n + 1):
        total = total + nCr(n, size)

    f.ExportToFile('rosalind_aspc_output.txt', str(total % 1000000))
    return
Ejemplo n.º 29
0
def EditDistance():
    """ Given 2 strings (FASTA), exports the value MakeMatrixDist
    computes for them - the edit distance, per the original
    description. """
    fasta_text = f.LoadFile('\\rosalind_edit.txt')
    Labels, (p, q) = f.FASTA(fasta_text)

    # the helper receives an empty matrix plus both lengths and strings
    result = MakeMatrixDist([], len(p), len(q), p, q)
    f.ExportToFile('rosalind_edit_output.txt', str(result))
    return
Ejemplo n.º 30
0
def Spectrum():
    """ Given the prefix spectrum of a protein (one mass per line),
    exports the protein string"""
    raw = f.LoadFile('\\rosalind_spec.txt').splitlines()
    ascending = sorted(float(x) for x in raw)

    # The gap between two neighbouring prefix masses, rounded to 4
    # decimals, is looked up in inv_massdict to get one residue
    protein = [inv_massdict[round(heavier - lighter, 4)]
               for lighter, heavier in zip(ascending, ascending[1:])]

    f.ExportToFile('rosalind_spec_output.txt', ''.join(protein))
    return
Ejemplo n.º 31
0
def detectUnstructuredModForm1(folder, outputFile):
    """ Reports CONSTS.SMELL_UNS_MOD_1 for a repository.

    Nothing is reported when isModulesExists(folder) is true. Otherwise
    the smell is reported when the manifests folder lookup returns an
    empty string, or when that folder holds more Puppet files than
    CONSTS.MAX_MANIFESTS_PUPPET_FILES. """
    if isModulesExists(folder):
        return

    manifestsFolder = getManifestsFolder(folder)  # repo-level manifests folder
    # short-circuit: files are only counted when a folder was found
    if (manifestsFolder == ""
            or FileOperations.countPuppetFiles(manifestsFolder) >
            CONSTS.MAX_MANIFESTS_PUPPET_FILES):
        Utilities.reportSmell(outputFile, manifestsFolder,
                              CONSTS.SMELL_UNS_MOD_1, CONSTS.REPO_MANIFEST)
Ejemplo n.º 32
0
def analyze(folder, repoName):
    """ Writes the analysis report for one repository folder.

    The report file CONSTS.PUPPETEER_OUT_FILE is created inside folder.
    It receives the Puppet file count first, then whatever
    SizeMetrics.collectSizeMetrics and SmellDectector.detectSmells
    write to it.

    folder: repository folder to analyze (also holds the report)
    repoName: not used by this function; kept for existing callers
    """
    # The with-block closes the report even when a collector raises
    with open(folder + "/" + CONSTS.PUPPETEER_OUT_FILE, 'w') as outputFile:
        puppetFileCount = FileOperations.countPuppetFiles(folder)
        summary = CONSTS.PUPPET_FILE_COUNT + str(puppetFileCount)
        outputFile.write(summary + "\n")
        Utilities.myPrint(summary)

        SizeMetrics.collectSizeMetrics(folder, outputFile)

        SmellDectector.detectSmells(folder, outputFile)
    return
 def getField(self, netcdf_file):
     """ Loads a field through FileOperations.getField and stores it.

     The full result is kept in self.data; its first three items are
     stored as self.lon, self.lat and self.psi, in that order. """
     field = FileOperations.getField(netcdf_file)
     self.data = field
     self.lon = field[0]
     self.lat = field[1]
     self.psi = field[2]
	def returnNetCDF(self, inputFileName, outputFileName, outField): 
		"""Pass the three arguments unchanged to FileOperations.returnNetCDF.

		Whatever that call returns is discarded; this method returns None.
		"""
		FileOperations.returnNetCDF( inputFileName, outputFileName, outField)
	def getField(self, netcdf_file):
		"""Return the result of FileOperations.getField(netcdf_file).

		The fetched value used to be discarded, so every caller received
		None; it is now handed back. Callers that ignored the result are
		unaffected.
		"""
		return FileOperations.getField(netcdf_file)
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import FileOperations as fo

# Load the training and testing splits; each call yields the samples
# together with their class labels.
training_data, training_data_class_labels = fo.load_twitter_data_from_file(fo.TRAINING_DATA_FILE_NAME)
testing_data, testing_data_class_labels = fo.load_twitter_data_from_file(fo.TESTING_DATA_FILE_NAME)

# L1-penalised logistic regression. The solver is named explicitly
# because the L1 penalty is only supported by some solvers: 'liblinear'
# was the default when this was written, while the current default
# ('lbfgs') rejects penalty='l1' with a ValueError.
classifier = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                             solver='liblinear')
classifier.fit(training_data, training_data_class_labels)
predicted_labels = classifier.predict(testing_data)

# Report per-class metrics (average=None keeps one score per class)
print("Confusion Matrix")
print(confusion_matrix(testing_data_class_labels, predicted_labels))
print("Precision")
print(precision_score(testing_data_class_labels, predicted_labels, average=None))
print("Recall")
print(recall_score(testing_data_class_labels, predicted_labels, average=None))
print("F1 score")
print(f1_score(testing_data_class_labels, predicted_labels, average=None))