Code Example #1
    def test_generic(self):
        sll1 = self.generate_sll([1, 2, 2, 3, 3, 3])
        self.assertEqual([1, 2, 3], self.get_values_from_sll(remove_duplicates(sll1)))

        sll2 = self.generate_sll([1, 1, 1])
        self.assertEqual([1], self.get_values_from_sll(remove_duplicates(sll2)))

        sll3 = None
        self.assertEqual([], self.get_values_from_sll(remove_duplicates(sll3)))
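The projects above show only the tests, not the implementation. As a point of reference, a minimal sketch of a hash-set based remove_duplicates for a singly linked list that would satisfy the assertions in examples #1 and #5 (assuming a Node class with data and next attributes, as the tests imply) could look like this:

def remove_duplicates(head):
    # Walk the list once, unlinking any node whose value was already seen
    seen = set()
    prev = None
    current = head
    while current is not None:
        if current.data in seen:
            prev.next = current.next  # drop the duplicate node
        else:
            seen.add(current.data)
            prev = current
        current = current.next
    return head  # a None input simply falls through and is returned unchanged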
Code Example #2
def merge():
    read_files = glob.glob("song_lyrics_raw/*.txt")

    with open("merged_lyrics.txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())
            outfile.write("\n")

    clean_bad_words.clean_bad_words("merged_lyrics.txt",
                                    "merged_lyrics_clean.txt")
    remove_duplicates.remove_duplicates("merged_lyrics_clean.txt",
                                        "merged_lyrics_unique.txt")
Code Example #3
def exon_coords(args):
    line_number1 = []
    line_number2 = []
    exon_file = open(args.exon).read().splitlines()
    # Parse the circRNA coordinates once instead of re-splitting them for every line
    coords = split_coords(args.coords)
    for line_number, line in enumerate(exon_file):
        col = line.split("\t")
        if col[0] == coords[0] and col[2] == "exon" and col[3] == str(coords[1]):
            line_number1.append(line_number)
        if col[0] == coords[0] and col[2] == "exon" and col[4] == str(coords[2]):
            line_number2.append(line_number)

    # Making list of exons composing circRNA
    exon_list = []
    for line in exon_file[line_number1[0]:line_number2[0] + 1]:
        col = line.split("\t")
        if col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])

    exon_list = remove_duplicates(exon_list)

    return exon_list
Code Example #4
def exon_lists(line_number1, line_number2, gtf_file):
    exon_list = []
    for line in gtf_file[line_number1[0]:line_number2[0] + 1]:
        col = line.split("\t")
        if col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])
    exon_list = remove_duplicates(exon_list)

    return exon_list
Code Example #5
    def test_duplicates(self):
        inputs = [('a', 'a', 'a', 'a'), (1, 2, 2, 3, 2, 1)]
        outputs = [('a',), (1, 2, 3)]
        for in_list, out_list in zip(inputs, outputs):
            # Build a linked list from in_list without repeating the head value
            in_head = node.Node(in_list[0])
            in_n = in_head
            for value in in_list[1:]:
                in_n.next = node.Node(value)
                in_n = in_n.next

            head = remove_duplicates.remove_duplicates(in_head)
            n = head
            for value in out_list:
                self.assertEqual(n.data, value)
                n = n.next
Code Example #6
def reverse_pair(array):
    half = int(len(array) / 2 + 0.5)
    pairs = []
    for word in array[0:half]:
        reversed_word = word[::-1]
        # bisect here is assumed to be a helper that reports whether
        # reversed_word is present in the (sorted) array
        if bisect(array, reversed_word):
            pairs.append(word)
            pairs.append(reversed_word)
    final_pairs = remove_duplicates(pairs)
    return final_pairs
Code Example #7
def run_locarnap(seqsin, numkept, cpus=1, foldless=False):
    """Runs locarna-p on a set of sequences in MinimalFastaParser format
    [(header, seq), (header, seq)] and retgurns alignemtn and structure"""
    seqs, headers = remove_duplicates(seqsin)
    # blank headers to save memory
    headers = 0
    # make sure group has enough sequences before continuing
    if len(seqs) < numkept and not foldless:
        return "", ""
    # headers come out in format Header_# so split to get # and sort by abundance
    seqs.sort(reverse=True, key=lambda count: int(count[0].split("_")[1]))
    # cut to numkept most abundant sequences
    if len(seqs) > numkept:
        seqs = seqs[:numkept]
    return create_locarnap_alignment(seqs, RNA, struct=True, params={"--cpus": cpus})
Code Example #8
def run_locarnap_for_infernal(currgroup, clusters, otus, basefolder):
    '''Function for multithreading
    creates the final locarna-p alignment and writes to files, then r2r struct'''
    #run locarna-p on the superclusters to get the alignment and consensus structure
    #skip if already run and the program just crashed or whatever
    currotufolder = basefolder + "group_" + str(currgroup)
    if exists(currotufolder):
        return ""
    seqs = []
    out = "group " + str(currgroup) + ": "
    for cluster in clusters:
        out += cluster + " "
        count = 0
        for header, seq in MinimalFastaParser(open(otus[cluster], 'rU')):
            seqs.append((header.split()[0], seq))
            count += int(header.split("_")[1])
    out += "\n" + str(count) + " sequences\n"
    #make sure group has enough sequences before continuing
    #run locarna-p on the at most 50 most abundant sequences in the group
    aln, struct = run_locarnap(seqs, 50, cpus=2, foldless=True)

    #create output folder for group
    mkdir(currotufolder)
    if(aln.getNumSeqs() < 50):
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        fout = open(currotufolder + "/unique.fasta", 'w')
        fout.write(aln.toFasta())
        fout.close()
    else:
        s, h = remove_duplicates(seqs)
        out += str(len(s)) + " unique sequences\n"
        write_fasta_list(s, currotufolder + "/unique.fasta")
    out += "Structure: " + struct + "\n"

    #write out alignment and structure in fasta and stockholm formats
    #write that shit
    logout = open(currotufolder + "/log.txt", 'w')
    logout.write(out)
    logout.close()
    alnout = open(currotufolder + "/locarnap-aln.fasta", 'w')
    alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
    alnout.close()
    alnout = open(currotufolder + "/locarnap-aln.sto", 'w')
    struct_dict = {'SS_cons': struct}
    alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))
    alnout.close()
    #make R2R secondary structure for alignment
    make_r2r(currotufolder + "/locarnap-aln.sto", currotufolder, "group_" + str(currgroup))
Code Example #9
 def test_continuous_duplicates(self):
     test = "aaabbbcccddd"
     self.assertEqual(remove_duplicates(test), "abcd")
Code Example #10
 def test_null_string(self):
     test = ""
     self.assertEqual(remove_duplicates(test), "")
Code Example #11
 def test_all_duplicates(self):
     test = "aaa"
     self.assertEqual(remove_duplicates(test), "a")
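Examples #9-#11, along with the no-duplicates test further down, exercise a string variant. A minimal sketch that keeps the first occurrence of each character, which satisfies all of those assertions, might be:

def remove_duplicates(s):
    # Preserve order while dropping characters that were already seen
    seen = set()
    out = []
    for ch in s:
        if ch not in seen:
            seen.add(ch)
            out.append(ch)
    return ''.join(out)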
Code Example #12
 def test_remove_duplicates(self):
     remove_duplicates(self.default_list)
     self.assertEqual(self.default_list.size(), 4)
Code Example #13
import os
import sys
import traceback
import ntpath
from PIL import Image

from match_files import match_files
from categorize_files import categorize_files
from convert_to_jpg import convert_to_jpg
from read_pgm import read_pgm
from remove_duplicates import remove_duplicates

if __name__ == '__main__':
    path = sys.argv[1]
    print "Matching files"
    match_files(path)
    print "Removing duplicates"
    remove_duplicates(path)
    print "Converting to JPEG"
    convert_to_jpg(path)
    print "Categorizing files"
    categorize_files(path)
	print "Finished."
Code Example #14
def main(argv, seed=None):
    random.seed(seed)
    global current_config
    current_config = get_default_config()
    arg = argv[1]
    arg = arg.replace("\\", "/")
    ar1 = arg
    orig_file = ar1
    mut_dir = arg[arg.rfind("/")+1:arg.rfind(".")] if arg.rfind("/") >= 0 else arg[:arg.rfind(".")]
    script_name = mut_dir
    mut_dir = (current_config["default_mut_dir"]+"/").replace("//","/") + mut_dir + "/"
    # Add script's directory to path
    sys.path.insert(0, mut_dir)
    # Store the reason why the mutation was completed
    mutants_with_cause = []
    # Timeout since our modifications may cause infinite loops
    timeout = int(current_config["min_timeout"]) if len(argv) < 4 or not argv[3] else int(argv[3])
    if not os.path.exists(mut_dir):
        os.makedirs(mut_dir)
    else:
        cleanup(mut_dir)

    # Index of the string currently processed
    str_cnt = 0
    # Mutation counter
    mut_cnt = 0
    pick_file = argv[2] if len(argv) > 2 else current_config["default_rejected"]
    pick_handle = open(pick_file, 'rb')
    rej_strs = pickle.load(pick_handle)
    if not rej_strs:
        raise SystemExit("File: " + pick_file + " contains no inputs.")

    # Precompute the locations of conditions and the lines of their then and else case and format the file properly
    global manual_errs
    manual_errs = argtracer.compute_base_ast(ar1, mut_dir + script_name + ".py")
    ar1 = mut_dir + script_name + ".py"

    # Record how long the slowest execution takes to have a better prediction of the required timeout
    slowest_run = 0
    # Get base values from the non-crashing run with the most conditions traversed
    progress = 1
    base_conds = []
    ln_cond = -1
    for cand in rej_strs:
        pos = 0
        print("Mutated string:", repr(cand[0]), flush=True)
        print("Valid string:", repr(cand[1]), flush=True)
        base_index = 0
        for str_inpt in cand:
            start_time = timer()
            try:
                print("Tracing:", progress, "/", 2*len(rej_strs), flush=True)
                (_,base_cond,_,someerror) = argtracer.trace(ar1, str_inpt)
                if pos == 1:
                    base_conds.append(base_cond)
                    if len(base_cond) > ln_cond:
                        basein = cand[1]
                        base_pos = base_index
                        ln_cond = len(base_cond)
                    if someerror:
                        raise SystemExit("Invalid input: " + repr(str_inpt) + ".\nAborted.")
                    base_index += 1
            finally:
                pos += 1
                time_elapsed = timer() - start_time
                if time_elapsed > slowest_run:
                    slowest_run = time_elapsed
                progress += 1
    # Choose a timeout that is very likely to let valid mutants finish
    timeout = max(timeout, int(int(current_config["timeout_slow_multi"])*slowest_run)+1)
    try:
        (_, b_cdict, _, err) = argtracer.trace(ar1, basein, timeout=timeout)
    except Timeout:
        print("Execution timed out on basestring! Try increasing timeout (currently", timeout, "seconds)")

    if err:
        raise SystemExit("Exiting: " + pick_file + " contains no valid inputs for " + ar1)

    # Remove duplicates (same condition trace) from valid inputs
    idxl = 0
    idxr = 0
    while idxl < len(base_conds):
        idxr = idxl+1
        while idxr < len(base_conds):
            if get_frozen(base_conds[idxl]) == get_frozen(base_conds[idxr]):
                del base_conds[idxr]
            else:
                idxr += 1
        idxl += 1

    print("Amount of unique base strings:", len(base_conds), flush=True)

    print("Used baseinput:", repr(basein))

    # Log the inputs since they are determined already
    input_log = LogWriter(mut_dir[:-1] + "_inputs.log")
    for i in range(len(rej_strs)):
        input_log.append_line(str(i) + ": " + repr(rej_strs[i][0])+"\n")
    input_log.append_line("The baseinput was: " + repr(basein))

    lwriter = LogWriter(mut_dir[:-1] + ".log")
    lwriter.append_line("Mutating script: " + repr(orig_file) + "\n")

    all_generated = { int_key : [] for int_key in range(len(base_conds)) }

    # Run the mutation process for every rejected string
    for s in rej_strs:
        s = s[0]
        if int(current_config["variable_base"]) == 0:
            queue = [(ar1, [], 0, None, None, None, base_index)]
        else:
            queue = []
            for base_index in range(len(base_conds)):
                queue.append((ar1, [], 0, None, None, None, base_index))
        discarded = set()
        # Save which exception the first execution of the rejected string produced
        original_ex_str = None
        # Stores which exceptions the valid string caused
        except_set = set()
        # The set of final lines observed by mutants rejecting the valid string
        rej_sigs = set()
        while queue:
            (arg, history, retries, pidx, pmstate, scstate, b_cindex) = queue.pop(0)
            skip = False
            b_cdict = base_conds[b_cindex]
            print("Current script:", arg, flush=True)
            # Check whether the chosen correct string is now rejected
            try:
                _mod = imp.load_source('mymod', arg)
            except:
                print("Discarded script:", arg, "(import error)", flush=True)
                os.remove(arg)
                continue
            print("Executing basestring...", flush=True)
            try:
                (lines, _, _, berr) = argtracer.trace(arg, basein, timeout=timeout)
            except argtracer.Timeout:
                print("Discarding:", arg, "(basestring timed out)", flush=True)
                os.remove(arg)
                continue

            # Remove lines used to construct custom exceptions
            lines = manual_errs.remove_custom_lines(lines)

            # If the crash happens on a condition we modified there is a high chance it's invalid, so we remove it.
            if lines[0] in history:
                print("Removed:", arg, "(potentially corrupted condition)", flush=True)
                os.remove(arg)
                continue

            # Mutation guided by rejected strings

            try:
                (lines, cdict, _, err) = argtracer.trace(arg, s, timeout=timeout)
            except:
                print("Discarding:", arg, "(mutated string timed out)", flush=True)
                os.remove(arg)
                continue

            # Remove lines used to construct custom exceptions
            lines = manual_errs.remove_custom_lines(lines)
            # If the crash happens on a condition we modified there is a high chance it's invalid, so we remove it.
            if lines[0] in history:
                print("Removed:", arg, "(potentially corrupted condition)", flush=True)
                os.remove(arg)
                continue

            if original_ex_str is None:
                if err == False:
                    print("Skipping string:", s, "(not rejected)!", flush=True)
                    continue
                else:
                    original_ex_str = str(err.__class__)

            # Check whether the modification changed the condition state
            skip = pmstate is not None and cdict.get(history[-1]) is not None and cdict.get(history[-1]) == pmstate

            if skip:
                print("Removed:", arg, "(unsuccessful modification)", flush=True)
                if retries < int(current_config["mut_retries"]) and pidx:
                    # Try again
                    full_str = manual_errs.get_if_from_line(history[-1], ar1)
                    cond_str = full_str[full_str.find("if")+3:full_str.rfind(":")]
                    inpt_ast = ast.fix_missing_locations(ast.parse(cond_str))
                    mtrans = MutTransformer(pidx)
                    res = mtrans.visit(inpt_ast)
                    fix = full_str[:full_str.find("if")+2] + " " + astunparse.unparse(res).strip() + ":"
                    if not fix.endswith("\n"):
                        fix = fix + "\n"
                    # Re-queue the retried mutant with the previous condition state
                    mods = { history[-1] : fix }
                    cand = mut_dir + script_name + "_" + str(str_cnt) + "_" + str(mut_cnt) + ".py"
                    queue.insert(0, (cand, history.copy(), retries+1, pidx, pmstate, None, b_cindex))
                    file_copy_replace(cand, arg, mods)
                    mut_cnt += 1
                elif retries >= int(current_config["mut_retries"]):
                    print("Retries exceeded:", arg, flush=True)
                os.remove(arg)
                continue

            sskip = (scstate is not None and cdict.get(history[-1]) is not None and cdict.get(history[-1]) == scstate)
            # Retries would be possible here as well, but since our search is blind for these conditions it's skipped
            if sskip:
                print("Removed:", arg, "(unsuccessful modification) (sec)", flush=True)
                os.remove(arg)
                continue

            if berr and (lines[0] not in rej_sigs or berr not in except_set):
                print("Mutation complete:", arg, "(base rejected)", flush=True)
                print("Exception for base on", arg, ":", repr(berr), flush=True)
                mutants_with_cause.append((arg, "valid string rejected"))
                lwriter.append_line(repr(mutants_with_cause[-1]) + "\n")

            (prim, sec) = get_left_diff(cdict, b_cdict)
            # Remove all elements that have been explored (history) or do not belong to the actual code (i.e. error constructor - lines)
            prim = [e for e in prim if e[0] not in history and e[0] in lines]
            sec = [e for e in sec if e[0] not in history and e[0] in lines] if int(current_config["blind_continue"]) else []
           
            # Don't create mutants if their line combination is already in the queue
            prim = [] if not prim else rm_dups(prim, history, all_generated, b_cindex)

            # Sec will never be progressed if prim is not empty
            sec = [] if not sec or len(prim) > 0 else rm_dups(sec, history, all_generated, b_cindex)

            print("Used string:", repr(s), flush=True)
            print("Queue length:", len(queue), flush=True)
            print("Change history:", history, flush=True)
            print("Difference to base (flipped):", prim, flush=True)
            print("Difference to base (new):", sec, flush=True)
            print("Final line:", str(lines[0]), flush=True)
            print("", flush=True)
            diff_err = False
            if err:
                # Check whether the exception is different from the first encountered one
                diff_err = str(err.__class__) != original_ex_str
                err = True
            print("Mutated string rejected:", err, "different:", diff_err, flush=True)
            if (err and not diff_err) or int(current_config["early_stop"]) == 0:
                all_fixes = get_possible_fixes((prim, sec), arg)
                if all_fixes:
                    for (fix_list, fix_line, pstate, sstate) in all_fixes:
                        # Create a mutant for every possible fix
                        for (fix, permindex) in fix_list:
                            if not fix.endswith("\n"):
                                fix = fix + "\n"
                            cand = mut_dir + script_name + "_" + str(str_cnt) + "_" + str(mut_cnt) + ".py"
                            mods = { fix_line : fix }
                            queue.insert(0,(cand, history.copy()+[fix_line],0, permindex, pstate, sstate, b_cindex))
                            file_copy_replace(cand, arg, mods)
                            mut_cnt += 1
            # Check whether the mutant is valid (rejects base or accepts mutated string) and record its behaviour
            if arg != ar1:
                if not err or diff_err:
                    print("Mutation complete:", arg, "(mutated string accepted)", flush=True)
                    mutants_with_cause.append((arg, "mutated string accepted"))
                    lwriter.append_line(repr(mutants_with_cause[-1]) + "\n")
                elif not berr or (berr and (lines[0] in rej_sigs and berr in except_set)):
                    discarded.add(arg)
                    rej_sigs.add(lines[0])
                    except_set.add(berr)
        # Don't delete the original script, we need it to create mutants from whenever a new rejected string is processed
        discarded.discard(ar1)
        # Remove all scripts that neither reject the base string nor accept the mutated string
        for scrpt in discarded:
            print("Removed:", scrpt, flush=True)
            os.remove(scrpt)
        # Adjust the file naming
        str_cnt += 1
        mut_cnt = 0
        print("Processing string number:", str(str_cnt), "/", str(len(rej_strs)),flush=True)
    # Move the copy of the original script since it is not a mutant
    orig_out = current_config["default_mut_dir"] + ar1[ar1.rfind("/")+1:]
    if os.path.exists(orig_out):
        os.remove(orig_out)
    os.rename(ar1, orig_out)
    print("Done. The final mutants are in:", mut_dir)
    # Remove duplicates and update the log accordingly
    mutants_with_cause = remove_duplicates(mut_dir, ".py", mutants_with_cause)

    lwriter = LogWriter(mut_dir[:-1] + ".log")
    lwriter.append_line("Mutating script: " + repr(orig_file) + "\n")
    for e in mutants_with_cause:
        lwriter.append_line(repr(e) + "\n")
Code Example #15
def phonetise_word(arabic_word):
    utterances = [arabic_word]
    arabic_word = arabic_utils.remove_diacritics(arabic_word)
    result = ''  # Pronunciations Dictionary
    utterances_pronunciations = [
    ]  # Most likely pronunciation for all utterances
    utterances_pronunciations_with_boundaries = [
    ]  # Most likely pronunciation for all utterances
    pronunciations = []
    phones = []
    # -----------------------------------------------------------------------------------------------------
    # Loop through utterances------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------
    utterance_number = 1
    for utterance in utterances:
        utterance_number += 1
        utterances_pronunciations.append(
            '')  # Add empty entry that will hold this utterance's pronunciation
        # Add empty entry that will hold this utterance's pronunciation (with boundaries)
        utterances_pronunciations_with_boundaries.append('')

        utterance = convert(utterance)
        # ---------------------------
        word_index = -1

        # Loop through words
        for word in utterance:
            word_index += 1
            if word not in [u'-', u'sil']:
                pronunciations = [
                ]  # Start with empty set of possible pronunciations of current word
                # Add fixed irregular pronunciations if possible
                result = isFixedWord2(word, result, word, pronunciations)
                # Indicates whether current character is in an emphatic context or not. Starts with False
                emphaticContext = False
                word = u'##' + word + u'##'  # This is the end/beginning of word symbol. just for convenience

                phones = [
                ]  # Empty list which will hold individual possible word's pronunciation

                # -----------------------------------------------------------------------------------
                # MAIN LOOP: here is where the Modern Standard Arabic phonetisation rule-set starts--
                # -----------------------------------------------------------------------------------
                for index in range(2, len(word) - 2):
                    letter = word[index]  # Current Character
                    nextCharacter = word[index + 1]  # Next Character
                    afterNextCharacter = word[index + 2]  # Next-Next Character
                    previousCharacter = word[index - 1]  # Previous Character
                    beforePreviousCharacter = word[index - 2]  # Before Previous Character

                    emphaticContext = emphatic_context.getState(letter, nextCharacter)
                    if letter in constants.unambiguousConsonantMap:
                        phones.append(constants.unambiguousConsonantMap[letter])
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'l':  # Lam is a consonant which requires special treatment
                        phones += handle_characters.lam(beforePreviousCharacter,
                                                        previousCharacter,
                                                        nextCharacter,
                                                        afterNextCharacter)
                    # ----------------------------------------------------------------------------------------------------------------
                    # shadda just doubles the letter before it
                    if letter == u'~' and previousCharacter not in [
                            u'w', u'y'
                    ] and len(phones) > 0:
                        phones[-1] += phones[-1]
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'|':  # Madda only changes based on emphaticness
                        phones += handle_characters.madda(emphatic_context)
                    # ----------------------------------------------------------------------------------------------------------------
                    if letter == u'p':  # Ta' marboota is determined by whether the following character is a diacritic or not
                        phones += handle_characters.p(nextCharacter)

                    if letter in constants.vowelMap:
                        # Waw and Ya are complex: they could be consonants or vowels, and their
                        # gemination could be a combination of a vowel and consonants
                        phones += handle_characters.handle_vowels(
                            previousCharacter, letter, nextCharacter,
                            afterNextCharacter, emphaticContext)
                        # Kasra and Damma could be mildened if before a final silent consonant
                        if letter in [u'u', u'i']:
                            phones += handle_characters.kasra_and_damma(
                                word, letter, emphaticContext, nextCharacter,
                                afterNextCharacter)
                        # Alif could be omitted in the definite article and at the beginning of some words
                        if letter in [u'a', u'A', u'Y']:
                            phones += handle_characters.alef(beforePreviousCharacter,
                                                             previousCharacter, letter,
                                                             nextCharacter,
                                                             emphaticContext)

                pronunciations += get_different_possible_pronounciations(phones)
                pronunciations = remove_duplicates(pronunciations)

    return [
        ' '.join(item) for item in pronunciations
        if len(item) >= len(arabic_word)
    ]
Code Example #16
File: test_rd.py Project: cbohara/FHP_challenges
 def test_output(self):
     """Are all duplicates removed?"""
     duplicates = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
     output = remove_duplicates(duplicates)
     self.assertEqual(output, [1, 2, 3, 4])
Code Example #17
 def test_no_duplicates(self):
     test = "abcd"
     self.assertEqual(remove_duplicates(test), "abcd")
Code Example #18
    lines_to_keep = []
    current_run = []

    for line in file:
        Q = float(line.split(",")[0])

        if Q < -1:
            #finished a run, so randomly pick a number
            # with probability 1/4 pick the maximum
            if random.random() < 0.25:
                lines_to_keep.append(current_run[-1])
            else:
                lines_to_keep.append(pick_random_partition(current_run))
            current_run = []
        
        current_run.append(line)

    file.close()

    if options.ensure_uniqueness:
        lines_to_keep = remove_duplicates(lines_to_keep)
        
    if options.filename is None:
        for line in lines_to_keep:
            print line,
    else:
        output_file = open(options.filename,"w")
        for line in lines_to_keep:
            output_file.write(line)
        output_file.close()     
Code Example #19
        # run locarna-p on the superclusters to get the alignment and consensus structure
        seqs = []
        out = "group " + str(currgroup) + ": "
        for cluster in structgroups[currstruct]:
            out += cluster + " "
            for header, seq in MinimalFastaParser(open(otus[cluster], "rU")):
                seqs.append((header, seq))
        print out
        print str(len(seqs)) + " sequences"
        # make sure group has enough sequences before continuing
        # run locarna-p on the at most 50 most abundant sequences in the group
        aln, struct = run_locarnap(seqs, 50, cpus=args.c, foldless=True)
        if aln.getNumSeqs() < 50:
            print str(aln.getNumSeqs()) + " unique sequences"
        else:
            s, h = remove_duplicates(seqs)
            print str(len(s)) + " unique sequences"
            s = 0
            h = 0
        print "Structure: " + struct

        # print out alignment and structure in fasta and stockholm formats
        # create output folder for group
        currotufolder = otufolder + "group_" + str(currgroup)
        if not exists(currotufolder):
            mkdir(currotufolder)
        # print that shit
        alnout = open(currotufolder + "/locarnap-aln.fasta", "w")
        alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        alnout.close()
        alnout = open(currotufolder + "/locarnap-aln.sto", "w")
Code Example #20
 # Get list of OTUs from file, populate dictionary
 # need to add -i for input, -o for out folder, -c for cpus, -r current round
 otus = []
 fn = open(argv[1], "rU")
 for line in fn:
     lineinfo = line.strip().split()
     otus.append((lineinfo[0], lineinfo[1]))
 fn.close()
 for currotu in otus:
     otu = currotu[0]
     print "==" + otu + "=="
     print "Reading in 30 most abundant sequences"
     # assuming that the fasta has more than 30 sequences in it. Safe assumption
     # if this is a significant cluster
     seqs = [(header, seq) for header, seq in MinimalFastaParser(open(currotu[1], "rU"))]
     seqs, headers = remove_duplicates(seqs)
     # blank headers to save memory
     headers = 0
     # headers come out in format Header_# so split to get # and sort by abundance
     seqs.sort(reverse=True, key=lambda count: int(count[0].split("_")[1]))
     # cut to 30 most abundant sequences
     seqs = seqs[:30]
     print "Running locarna-p on sequences"
     args = {"--cpus": "24"}
     aln, struct = create_locarnap_alignment(seqs, RNA, struct=True, params=args)
     # create output folder for OTU
     otufolder = "/Users/Ely/Desktop/Ely_selection/R7/lead_clusters/"
     if not exists(otufolder):
         mkdir(otufolder)
     otufolder += otu
     if not exists(otufolder):
Code Example #21
 def test_one_node(self):
     inputs = ['a', 'b', 'c']
     for head_value in inputs:
         head = node.Node(head_value)
         self.assertEqual(remove_duplicates.remove_duplicates(head), head)
Code Example #22
 def test_remove_duplicates(self):
     self.assertEqual(remove_duplicates([0, 0, 1, 1, 1, 2, 2, 3, 3, 4]), 5)
     self.assertEqual(remove_duplicates([1, 1, 2]), 2)
     self.assertEqual(remove_duplicates([0, 1, 2, 3, 4]), 5)
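The assertions just above exercise an in-place variant for a sorted list that returns the new length. A minimal sketch consistent with those expected return values (an assumption; the tested implementation is not shown) is:

def remove_duplicates(nums):
    # Compact the unique values of a sorted list to the front and return how many remain
    if not nums:
        return 0
    write = 1
    for read in range(1, len(nums)):
        if nums[read] != nums[write - 1]:
            nums[write] = nums[read]
            write += 1
    return write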
Code Example #23
File: clean_seqs.py Project: squirrelo/SelexTrace
        str(len(rem)) + " sequences removed")
        print str(len(kept)) + " sequences left, " + \
        str(len(rem)) + " sequences removed. " + str((time() - secs)/60) + " minutes\n"
        write_fasta_list(kept, currfolder + "-Stripped.fasta")
        write_fasta_list(rem, currfolder + "-NotStripped.fasta")
        rem = 0
        #remove all sequences with Ns and short sequences
        print "Remove short and ambiguous sequences"
        secs = time()
        kept = rem_N_short(kept, args.l)
        log.write("Remove short and ambiguous sequences\n" + str(len(kept)) + " sequences left\n")
        print str(len(kept)) + " sequences left. " + str((time() - secs)/60) + " minutes"
        write_fasta_list(kept, currfolder + "-CleanStripped.fasta")

        #remove duplicate sequences from the fasta file and store for later
        print "Remove duplicates"
        secs = time()
        kept, headers = remove_duplicates(kept)
        write_fasta_list(kept, currfolder + "-Unique.fasta")
        #write out file holding headers keyed to a sequence
        keyfile = open(currfolder + "-seqtoheaders.txt", 'w')
        for key in headers:
            keyfile.write(key + "\t")
            for item in headers[key]:
                keyfile.write(item + ",")
            keyfile.write("\n")
        keyfile.close()
        log.write("Remove duplicates\n" + str(len(kept)) + " sequences left")
        print str(len(kept)) + " sequences left. " + str((time() - secs)/60) + " minutes\n"
        log.close()
Code Example #24
 def test_empty(self):
     inputs = [None]
     for head in inputs:
         self.assertEqual(remove_duplicates.remove_duplicates(head), head)