コード例 #1
0
ファイル: align.py プロジェクト: udincer/crude_aligner
def align_paired_read(paired_read, ref, ref_hash):
    """Place both mates of a read pair on the reference.

    Two orientations are tried in turn: (mate 0, reversed mate 1) and
    (mate 1, reversed mate 0).  In each orientation the first read is the
    "primary" and is aligned with ``align_single_read``; the "secondary"
    read is only aligned when the primary produced at least one candidate.
    A primary location is accepted only when exactly one secondary location
    falls inside the ``c.PAIRED_GAP_ESTIMATE_INTERVAL`` window on either
    side of it.

    Args:
        paired_read: Indexable pair of read sequences.  The second member
            of each orientation is reversed with ``[::-1]`` (it is not
            complemented here).
        ref: Not used by this function.
        ref_hash: Index handed through to ``align_single_read``.

    Returns:
        A flat list of ``(location, utils.key_to_integer(read))`` tuples.
        Each accepted pairing contributes its primary entry followed by
        its secondary entry.
    """
    pieces_needed_primary = 2
    pieces_needed_secondary = 2

    first_mate, second_mate = paired_read[0], paired_read[1]
    candidate_orientations = (
        (first_mate, second_mate[::-1]),
        (second_mate, first_mate[::-1]),
    )

    placements = []

    for primary_read, secondary_read in candidate_orientations:
        primary_hits = align_single_read(primary_read, ref_hash, c.KEY_SIZE,
                                         pieces_needed_primary)
        if not primary_hits:
            # Nothing to anchor the pair on in this orientation.
            continue

        secondary_hits = align_single_read(secondary_read, ref_hash, c.KEY_SIZE,
                                           pieces_needed_secondary)
        gap_min = c.PAIRED_GAP_ESTIMATE_INTERVAL[0]
        gap_max = c.PAIRED_GAP_ESTIMATE_INTERVAL[1]

        for primary_pos in primary_hits:
            # Keep secondary candidates lying within the expected gap,
            # either after or before the primary location.
            mates_in_window = []
            for candidate in secondary_hits:
                after = primary_pos + gap_min < candidate < primary_pos + gap_max
                before = primary_pos - gap_max < candidate < primary_pos - gap_min
                if after or before:
                    mates_in_window.append(candidate)

            # More than one candidate probably means a repetitive region,
            # so only an unambiguous single match is accepted.
            if len(mates_in_window) != 1:
                continue

            primary_entry = (primary_pos, utils.key_to_integer(primary_read))
            secondary_entry = (mates_in_window[0],
                               utils.key_to_integer(secondary_read))
            placements.extend([primary_entry, secondary_entry])

    return placements
コード例 #2
0
ファイル: hash_genome.py プロジェクト: udincer/crude_aligner
def read_file_hash_and_pickle(file_path, key_size, pickle_filename):
    """Index every ``key_size``-long substring of a reference file and pickle it.

    The first line of ``file_path`` is skipped (presumably a FASTA-style
    header -- confirm against the input files).  The remaining lines are
    stripped and concatenated into one reference string.  Every substring
    of length ``key_size`` is converted with ``utils.key_to_integer`` and
    mapped to the list of offsets at which it starts.

    Progress messages are printed while reading and hashing.

    Args:
        file_path: Path of the text file holding the reference sequence.
        key_size: Length of the substrings used as index keys.
        pickle_filename: Path the pickled index is written to.

    Returns:
        None.  The result is the pickle file, which holds a
        ``defaultdict(list)`` mapping integer key -> list of start offsets.
    """
    # Collect the lines and join once; repeated ``ref += line`` can
    # degrade to quadratic copying on a genome-sized input.
    pieces = []
    with open(file_path, 'r') as f:
        next(f, None)  # drop the first line; tolerate an empty file
        for line_count, line in enumerate(f):
            pieces.append(line.strip())
            if line_count % 100000 == 0:
                print('lines read: {}'.format(line_count))
    ref = ''.join(pieces)

    print('initializing dict')
    ref_hash = defaultdict(list)
    for i in xrange(len(ref) - key_size + 1):
        key = ref[i:i + key_size]
        ref_hash[utils.key_to_integer(key)].append(i)
        if i % 100000 == 0:
            print('hashing position {}'.format(i))

    print('pickling hashed dict')
    # Use a context manager so the output file is flushed and closed
    # deterministically instead of whenever the handle is collected.
    with open(pickle_filename, 'wb') as pickle_file:
        pickle.dump(ref_hash, pickle_file)
コード例 #3
0
ファイル: align.py プロジェクト: udincer/crude_aligner
def align_single_read(read, ref_hash, key_size,
                      min_num_pieces=c.MINIMUM_NUMBER_OF_PIECES_TO_MATCH):
    """Return candidate start positions of ``read`` on the reference.

    The read is cut into consecutive, non-overlapping pieces of
    ``key_size`` characters (a trailing remainder shorter than ``key_size``
    is ignored).  Each piece is looked up in ``ref_hash``; every hit is
    shifted back by the piece's offset within the read, so that it "votes"
    for the position where the whole read would start.  Positions that
    collect at least ``min_num_pieces`` votes are returned.

    Args:
        read: Sequence to align.
        ref_hash: Mapping from ``utils.key_to_integer(piece)`` to a list
            of reference offsets where that piece occurs.
        key_size: Piece length; should match the key length used to build
            ``ref_hash``.
        min_num_pieces: Minimum number of pieces that must agree on a
            start position for it to be reported.

    Returns:
        A list of start positions, in the iteration order of the vote
        counter.
    """
    votes = Counter()

    # ``//`` makes the floor division explicit instead of relying on
    # Python 2 integer ``/``.
    for piece_index in xrange(len(read) // key_size):
        offset = piece_index * key_size
        piece = read[offset:offset + key_size]
        # Use .get() rather than indexing: if ref_hash is a defaultdict,
        # ``ref_hash[key]`` would insert an empty list for every piece
        # that is absent from the reference, growing the shared index
        # with each unmatched read.  It also avoids a KeyError on a
        # plain dict.
        hits = ref_hash.get(utils.key_to_integer(piece), ())
        # NOTE(review): a hit near the start of the reference can yield a
        # negative candidate position here; callers may want to filter it.
        votes.update(hit - offset for hit in hits)

    return [start for start in votes if votes[start] >= min_num_pieces]
コード例 #4
0
def ident_bad_regions(some_pr_tuples, ref):
    """Flag the reference positions covered by exactly-matching reads.

    For each ``(position, read_number)`` entry, the reference window of
    ``c.READ_SIZE`` characters starting at ``position`` is converted with
    ``utils.key_to_integer`` and compared with ``read_number``.  When they
    are equal, that window is set to True in the returned bit array.
    Progress (fraction done and elapsed seconds) is printed every 10000
    entries, and the total elapsed time at the end.

    Args:
        some_pr_tuples: Sized iterable of entries whose index 0 is a
            reference position and index 1 the integer form of a read.
        ref: Reference sequence.

    Returns:
        A tuple ``(good_areas, nonperfect_pos_read_tuples)``.
        ``good_areas`` is a bitarray of ``len(ref)`` bits.  The second
        element is currently always an empty list: collection of
        non-matching entries is disabled in this version.
    """
    nonperfect_pos_read_tuples = []
    good_areas = bitarray(len(ref))
    good_areas.setall(False)
    started_at = time.time()

    for processed, entry in enumerate(some_pr_tuples, 1):
        read_num = entry[1]
        pos = entry[0]
        window_end = pos + c.READ_SIZE
        if read_num == utils.key_to_integer(ref[pos:window_end]):
            good_areas[pos:window_end] = True
        # Non-matching entries are intentionally ignored here; they are
        # not appended to nonperfect_pos_read_tuples.

        if processed % 10000 == 0:
            # float() keeps this a true division under Python 2.
            print('{}'.format(float(processed) / len(some_pr_tuples)))
            print('{}'.format(time.time() - started_at))

    print('piece done in {}'.format((time.time() - started_at)))
    return (good_areas, nonperfect_pos_read_tuples)