Example #1
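This function reads protein sequences from a .txt or .fasta file, optionally removes near-duplicate sequences (at least 90 percent pairwise similarity), extracts a descriptor vector for each remaining sequence, and pickles the result.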
import time
import random

import numpy as np

# Assumed module-level context not shown in this example: helper modules IO
# (sequence reading / pickling) and FX (descriptor extraction), a similar()
# string-similarity function, and the globals reduce_by_similarity, sc_1,
# sc_2, minlength, posfile and negfile.


def extract_descriptors_from_file_to_pickle(inputfile,
                                            outputfile,
                                            num_pos_sample=0):
    print("Working on: " + str(inputfile))
    print(" ")
    s_read_seq = time.time()
    if reduce_by_similarity == 1:
        if "_reduced" in inputfile:
            print(
                "File already reduced to be maximum 90 percent identical! Clear reduce_by_similarity!"
            )
            input()
        elif ".txt" in inputfile:
            name = inputfile.replace('.txt', '')
            file_to_reduce = open(inputfile)
            lines = file_to_reduce.readlines()
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
            file_to_reduce.close()
        elif ".fasta" in inputfile:
            name = inputfile.replace('.fasta', '')
            lines = IO.read_fasta_file(inputfile)
            lines = [str(line) for line in lines]
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
        else:
            print(
                "Unknown file format! Use .fasta or .txt! Press CTRL-C to exit"
            )
            input()

        out = name + "_reduced.txt"
        deleted = []
        # sim_array[i, j] == 1 flags pairs that are at least 90 percent similar
        sim_array = np.zeros((line_number, line_number))

        for i in range(line_number):
            print("Doing line %d out of %d" % (i, line_number))
            string1 = lines[i].strip()
            for j in range(i + 1, line_number):
                string2 = lines[j].strip()
                if similar(string1, string2) >= 0.9:
                    sim_array[i, j] = 1
                    sim_array[j, i] = 1

        # Greedily delete the sequence with the most similar partners until no
        # similar pair remains.
        while np.sum(sim_array) != 0:
            sum_arr = np.sum(sim_array, 0)
            idx_to_be_deleted = np.argmax(sum_arr)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 0)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 1)
            deleted.append(lines[idx_to_be_deleted])
            del lines[idx_to_be_deleted]

        print("Deleted items:")
        for item in deleted:
            print(item)

        with open(out, "w") as f:
            for line in lines:
                f.write(line.strip() + "\n")  # strip avoids doubled newlines from readlines()

        inputfile = out

    if ".txt" in inputfile:
        seqs = []
        with open(inputfile) as f:
            for line in f:
                seqs.append(line.strip())  # strip() avoids trailing newline issues
        inputfile = inputfile.replace("_reduced.txt", "")
    elif ".fasta" in inputfile:
        seqs = IO.read_fasta_file(inputfile)
        inputfile = inputfile.replace("_reduced.fasta", "")
    else:
        print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
        input()
    e_read_seq = time.time()
    print("Total time to read sequences: " + str(e_read_seq - s_read_seq))
    print(str(len(seqs)))
    chars = set('ARNDCQEGHILKMFPSTWYV')  # the 20 standard amino-acid letters

    if inputfile in negfile:  # downsample the negative set to balance classes
        if num_pos_sample == 0:
            print("Error: num_pos_sample must be nonzero for the negative set! Use CTRL-C to quit")
            input()
        print(num_pos_sample)
        if num_pos_sample > len(seqs):
            print(
                "Warning: Class balance may not be achieved! Press ENTER to accept or CTRL-C to exit"
            )
            input()
        # Draw a random pool of candidate indices (note: index 0 is never drawn).
        # If the sequence set is large, consider scaling this sample size down.
        a = random.sample(range(1, len(seqs)), round(sc_2 * num_pos_sample))
        newseqs = []
        i = 1  # counts accepted sequences plus one
        for number in a:
            print(i)
            if len(seqs[number]) > minlength and all(
                    c in chars for c in seqs[number].upper()):
                newseqs.append(seqs[number])
                print(seqs[number])
                i += 1
            if i > num_pos_sample:
                break
        if i <= num_pos_sample:  # fewer than num_pos_sample valid negatives found
            print(
                "The negative set does not contain enough valid inputs to make the classifier balanced. Reduce downsampling! Use CTRL-C to quit!"
            )
            input()
        seqs = newseqs
    total_samples = len(seqs)
    s_x_desc = time.time()
    dvecs = []
    current_seq = 1
    dropped = 0
    for s in seqs:
        s = s.upper()
        if inputfile not in negfile:  # negatives were already validated above; skip the check for efficiency
            if not all(c in chars for c in s) or len(s) < (minlength + 1):
                dropped += 1
                continue
        print("Extracting descriptors for sequence: " + str(current_seq) +
              "/" + str(total_samples))
        s_x_seq = time.time()
        dvec = FX.extract_named_descriptors_of_seq(s)
        dvecs.append(dvec)
        if inputfile in posfile:
            # Running counts for the positive set, printed each iteration.
            num_pos_sample = len(dvecs)
            print("Number of positive samples: %d" % num_pos_sample)
            print(
                "Number of samples dropped due to meaningless characters: %d" %
                dropped)
        e_x_seq = time.time()
        print("Took: " + str(e_x_seq - s_x_seq))
        print(" ")
        current_seq += 1
    e_x_desc = time.time()
    print("Total time to extract descriptors: " + str(e_x_desc - s_x_desc))

    IO.serialize_descriptor_vector(dvecs, o_file=outputfile)

    return num_pos_sample
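Both examples call a similar() helper that is not shown on this page. A minimal sketch, assuming it is a plain difflib ratio wrapper (an assumption, not the project's confirmed implementation):

from difflib import SequenceMatcher

def similar(a, b):
    # Similarity ratio in [0, 1]; 1.0 means identical strings.
    return SequenceMatcher(None, a, b).ratio()

Note that the pairwise comparison above is O(n^2) in the number of sequences, so this helper dominates the runtime of the reduction step.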
Example #2

A variant of the same function that parallelizes descriptor extraction with a multiprocessing pool.

from itertools import compress
from multiprocessing import Manager, Value, Lock, Pool

import time
import random

import numpy as np

# Assumed module-level context not shown: IO, FX, similar(), the globals
# reduce_by_similarity, sc_1, sc_2, minlength, posfile, negfile and numcores,
# plus the pool helpers initializer() and thefunction() (sketched after this
# example).


def extract_descriptors_from_file_to_pickle(inputfile,
                                            outputfile,
                                            num_pos_sample=0):
    print("Working on: " + str(inputfile))
    print(" ")
    s_read_seq = time.time()
    if reduce_by_similarity == 1:
        if "_reduced" in inputfile:
            print(
                "File already reduced to be maximum 90 percent identical! Clear reduce_by_similarity!"
            )
            input()
        elif ".txt" in inputfile:
            name = inputfile.replace('.txt', '')
            file_to_reduce = open(inputfile)
            lines = file_to_reduce.readlines()
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
            file_to_reduce.close()
        elif ".fasta" in inputfile:
            name = inputfile.replace('.fasta', '')
            lines = IO.read_fasta_file(inputfile)
            lines = [str(line) for line in lines]
            if num_pos_sample != 0:
                lines = lines[:round(sc_1 * num_pos_sample)]
            line_number = len(lines)
        else:
            print(
                "Unknown file format! Use .fasta or .txt! Press CTRL-C to exit"
            )
            input()

        out = name + "_reduced.txt"
        deleted = []
        # sim_array[i, j] == 1 flags pairs that are at least 90 percent similar
        sim_array = np.zeros((line_number, line_number))

        for i in range(line_number):
            print("Doing line %d out of %d" % (i, line_number))
            string1 = lines[i].strip()
            for j in range(i + 1, line_number):
                string2 = lines[j].strip()
                if similar(string1, string2) >= 0.9:
                    sim_array[i, j] = 1
                    sim_array[j, i] = 1

        # Greedily delete the sequence with the most similar partners until no
        # similar pair remains.
        while np.sum(sim_array) != 0:
            sum_arr = np.sum(sim_array, 0)
            idx_to_be_deleted = np.argmax(sum_arr)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 0)
            sim_array = np.delete(sim_array, idx_to_be_deleted, 1)
            deleted.append(lines[idx_to_be_deleted])
            del lines[idx_to_be_deleted]

        print("Deleted items:")
        for item in deleted:
            print(item)

        with open(out, "w") as f:
            for line in lines:
                f.write(line.strip() + "\n")  # strip avoids doubled newlines from readlines()

        inputfile = out

    if ".txt" in inputfile:
        seqs = []
        with open(inputfile) as f:
            for line in f:
                seqs.append(line.strip())  # strip() avoids trailing newline issues
        inputfile = inputfile.replace("_reduced.txt", "")
    elif ".fasta" in inputfile:
        seqs = IO.read_fasta_file(inputfile)
        inputfile = inputfile.replace("_reduced.fasta", "")
    else:
        print("Unknown file format! Use .fasta or .txt! Press CTRL-C to exit")
        input()
    e_read_seq = time.time()
    print("Total time to read sequences: " + str(e_read_seq - s_read_seq))
    print(str(len(seqs)))
    chars = set('ARNDCQEGHILKMFPSTWYV')  # the 20 standard amino-acid letters

    if inputfile in negfile:  # downsample the negative set to balance classes
        if num_pos_sample == 0:
            print("Error: num_pos_sample must be nonzero for the negative set! Use CTRL-C to quit")
            input()
        print(num_pos_sample)
        if num_pos_sample > len(seqs):
            print(
                "Warning: Class balance may not be achieved! Press ENTER to accept or CTRL-C to exit"
            )
            input()
        # Draw a random pool of candidate indices (note: index 0 is never drawn).
        # If the sequence set is large, consider scaling this sample size down.
        a = random.sample(range(1, len(seqs)), round(sc_2 * num_pos_sample))
        newseqs = []
        i = 1  # counts accepted sequences plus one
        for number in a:
            print(i)
            if len(seqs[number]) > minlength and all(
                    c in chars for c in seqs[number].upper()):
                newseqs.append(seqs[number])
                print(seqs[number])
                i += 1
            if i > num_pos_sample:
                break
        if i <= num_pos_sample:  # fewer than num_pos_sample valid negatives found
            print(
                "The negative set does not contain enough valid inputs to make the classifier balanced. Reduce downsampling! Use CTRL-C to quit!"
            )
            input()
        seqs = newseqs
    dvecs = Manager().list()  # shared list the worker processes append to
    current_seq = Value('i', 1)  # shared progress counter
    lock = Lock()
    seqs = [s.upper() for s in seqs]
    # Drop sequences that are too short or contain non-amino-acid characters.
    mask = [all(c in chars for c in s) and len(s) > minlength for s in seqs]
    dropped = len(seqs) - sum(mask)
    seqs = list(compress(seqs, mask))
    total_samples = len(seqs)
    pool = Pool(numcores, initializer,
                (current_seq, dvecs, total_samples, lock))
    s_parallel = time.time()
    pool.map(thefunction, seqs)  # blocks until all sequences are processed
    e_parallel = time.time()
    pool.close()
    pool.join()
    print("Total time to extract descriptors: " + str(e_parallel - s_parallel))
    if inputfile in posfile:
        num_pos_sample = len(dvecs)
        print("Number of positive samples: %d" % num_pos_sample)
    print("Number of samples dropped due to meaningless characters: %d" %
          dropped)

    # Copy the managed ListProxy into a plain list so it can be pickled.
    y = dvecs._callmethod('__getitem__', (slice(1, total_samples + 1), ))
    IO.serialize_descriptor_vector(y, o_file=outputfile)

    return num_pos_sample
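The pool in Example #2 is built with two helpers, initializer() and thefunction(), that do not appear on this page. A minimal sketch of what they could look like, assuming the worker simply wraps FX.extract_named_descriptors_of_seq() and uses the shared counter for progress reporting (hypothetical bodies, not the project's confirmed code):

def initializer(counter, shared_dvecs, n_total, shared_lock):
    # Runs once per worker process: stash the shared objects as globals so
    # thefunction() can reach them.
    global current_seq, dvecs, total_samples, lock
    current_seq = counter
    dvecs = shared_dvecs
    total_samples = n_total
    lock = shared_lock

def thefunction(s):
    # Extract descriptors for one sequence and append them to the shared list.
    with lock:
        print("Extracting descriptors for sequence: %d/%d" %
              (current_seq.value, total_samples))
        current_seq.value += 1
    dvecs.append(FX.extract_named_descriptors_of_seq(s))

This matches the Pool(numcores, initializer, initargs) call above: each worker receives the shared Value, Manager list, total count, and Lock once at startup rather than with every task.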