Ejemplo n.º 1
0
def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Read a FASTA file and accumulate per-sequence read counts into *hsh*.

    For every record the sequence is normalised via ``tr`` (lowercase /
    ``u`` / ``.`` / ``n`` mapped onto uppercase DNA) and the count derived
    from the record id by ``find_cnt`` is added to ``hsh[seq]``.

    file_fasta -- path of the FASTA file to read
    hsh        -- dict mapping normalised sequence -> summed count (mutated)
    options    -- CLI option dict; ``options['-a'] == ''`` enables progress
                  output on stderr
    """

    def _flush(rec_id, rec_seq):
        # Normalise the record's sequence and add its count to the hash.
        cnt = find_cnt(rec_id)
        rec_seq = tr(rec_seq, '[acgtun.]', '[ACGTTNN]')
        # EAFP instead of create_hash_key_chain: one dict lookup in the
        # common (already-present) case, as the original inner loop did.
        try:
            hsh[rec_seq] += cnt
        except KeyError:
            hsh[rec_seq] = cnt

    if options.get('-a') == '':
        print_stderr('reading file into hash\n')

    _id = ''
    seq = ''
    running_1 = 0

    # 'rb' kept from the original; assumes open_or_die2 yields a handle
    # whose readline() is compatible with the str regexes below -- TODO
    # confirm (on Python 3 a true binary handle would return bytes).
    FASTA = open_or_die2(file_fasta, 'rb')
    try:
        while True:
            line = FASTA.readline()
            if not line:
                break  # genuine EOF only -- a blank line no longer
                # terminates parsing (the original broke on any empty
                # stripped line, silently truncating such files)
            line = line.strip()

            m = re.match(r'^>(\S+)', line)
            if m:
                if _id:
                    # A new header ends the previous record: flush it.
                    _flush(_id, seq)
                    running_1 += 1
                    if options.get('-a') == '':
                        print_stderr('{}\r'.format(running_1))
                # group() keeps the leading '>' exactly as the original
                # did; find_cnt is assumed to expect that -- TODO confirm.
                _id = m.group()
                seq = ''
            else:
                seq += line

        if _id:
            # Flush the final record.  Guarded so an empty or header-less
            # file no longer calls find_cnt('') and inserts a bogus key.
            _flush(_id, seq)
            running_1 += 1
            if options.get('-a') == '':
                print_stderr('{}\r'.format(running_1))
    finally:
        FASTA.close()
Ejemplo n.º 2
0
def read_handler(handle):
    """Validate a FASTA stream line by line.

    Header lines ('>' prefix) must contain no whitespace in the id and are
    tallied into the global ``hash_num``; every other line must consist
    solely of [acgtunACGTUN].  ``die`` is called with a descriptive message
    on the first violation.  The global ``counter`` tracks the current
    (1-based) line number; reading stops at EOF or the first blank line.
    """
    global counter

    # iter() with a '' sentinel: stops on EOF or an empty stripped line,
    # exactly like the original `if not rin: break` loop.
    for rin in iter(lambda: handle.readline().strip(), ''):
        counter += 1

        header = re.match(r'^\>(.+)$', rin)
        if header:
            _id = header.group(1)
            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n'
                    .format(Nicenumber(counter), _id, hint))
            else:
                create_hash_key_chain(hash_num, 0, _id)
                hash_num[_id] += 1
            continue

        if not re.match(r'^([A|C|G|T|U|N|a|c|g|t|u|n]+)$', rin):
            die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}'
                .format(Nicenumber(counter), rin, hint))
Ejemplo n.º 3
0
def resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options):
    """Tally one permuted mrd entry into *_hash*, bucketed by floored score.

    Increments ``_hash['total'][permutation][floor]`` plus either the
    'known' (refs non-empty) or 'novel' counter, then empties *refs*
    in place.  Exits if *permutation* was never parsed from the file.
    """
    if permutation is None:
        print_stderr('The {} file is not properly formatted.\nMaybe it does not contain the lines with \"permutation int\"?\n'.format(
            options.get('-a')
        ))
        sys.exit(0)

    floor = int(math.floor(score))
    bucket = 'known' if refs else 'novel'

    # 'total' first, then the known/novel bucket -- same order as before.
    for key in ('total', bucket):
        create_hash_key_chain(_hash, 0, key, permutation, floor)
        _hash[key][permutation][floor] += 1

    # Clear the caller's list in place (equivalent to popping every item).
    del refs[:]
Ejemplo n.º 4
0
def resolve_entry_file_mrd(score, refs, _hash):
    """Tally one mrd entry into *_hash*, bucketed by floored score.

    Increments ``_hash['total'][floor]`` plus either the 'known' (refs
    non-empty) or 'novel' counter.  For known entries the global
    ``hash_sig`` keeps the highest floored score seen per reference.
    *refs* is emptied in place before returning.
    """
    global hash_sig
    floor = int(math.floor(score))
    bucket = 'known' if refs else 'novel'

    create_hash_key_chain(_hash, 0, 'total', floor)
    _hash['total'][floor] += 1

    create_hash_key_chain(_hash, 0, bucket, floor)
    _hash[bucket][floor] += 1

    if refs:
        # Track the best (highest) floored score per known reference.
        for ref in refs:
            hash_sig[ref] = max(hash_sig[ref], floor)

    # Clear the caller's list in place (equivalent to popping every item).
    del refs[:]
def insertfeature(db, strand, db_beg, db_end, freq):
    """Accumulate *freq* onto the (db, strand, db_beg, db_end) slot of the
    global ``hash_pos``, creating missing nested levels first."""
    global hash_pos
    key_path = (db, strand, db_beg, db_end)
    create_hash_key_chain(hash_pos, 0, *key_path)
    hash_pos[db][strand][db_beg][db_end] += freq
Ejemplo n.º 6
0
def parse_file_arf(file_arf, options):
    """Stream a 13-column ARF alignment file, filter each hit through the
    CLI option switches, then either print the surviving line or tally its
    edit count.

    file_arf -- path to the ARF file
    options  -- CLI option dict; switches used: -a (max edits), -b/-c
                (min/max query map length), -d/-e (query include/exclude),
                -f/-g (db include/exclude), -h/-i (output mode),
                -j (trim trailing nts)

    Globals: increments ``running`` once per parsed line; when ``gscan``
    is truthy, fills ``hash_edits[query][edits]`` instead of printing.
    """
    global running, gscan, hash_edits
    FILE_ARF = open_or_die(file_arf, 'rb',
                           'can not open {}\n'.format(file_arf))
    while True:
        line = FILE_ARF.readline()
        if not line:
            break

        # 13 whitespace-separated ARF columns:
        # query, q_map_lng, q_beg, q_end, q_seq, db, db_map_lng, db_beg,
        # db_end, db_seq, strand, edits, edit_string
        m = re.match(
            r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            query = m[0]
            query_map_lng = int(m[1])
            # NOTE(review): query_beg and db_beg are kept as strings while
            # their neighbours are int-converted; they are only ever
            # re-printed below, so this is harmless here -- but confirm
            # before reusing them numerically.
            query_beg = m[2]
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_map_lng = int(m[6])
            db_beg = m[7]
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]

            running += 1
            # -j: strip trailing nucleotides before any filtering, so the
            # edit/length cut-offs below see the trimmed values.
            if options.get('-j') == '':
                (query_map_lng, query_end, query_seq, db_map_lng, db_end,
                 db_seq, edits, edit_string) = remove_trailing_nts(
                     query_map_lng, query_end, query_seq, db_map_lng, db_end,
                     db_seq, edits, edit_string)

            # -a: drop hits with more edits than allowed.
            if '-a' in options.keys() and int(options.get('-a')) < edits:
                continue

            # -b: drop hits whose mapped query length is below the minimum.
            if options.get('-b') and query_map_lng < int(options.get('-b')):
                continue

            # -c: drop hits whose mapped query length exceeds the maximum.
            if options.get('-c') and int(options.get('-c')) < query_map_lng:
                continue

            # -d / -e: query include / exclude lists (global hashes).
            if options.get('-d') and query not in hash_queries_incl.keys():
                continue

            if options.get('-e') and query in hash_queries_excl.keys():
                continue

            # -f / -g: db include / exclude lists (global hashes).
            if options.get('-f') and db not in hash_dbs_incl.keys():
                continue

            if options.get('-g') and db in hash_dbs_excl.keys():
                continue

            # Unless -h/-i select evaluation mode, just echo the
            # (possibly trimmed) surviving line and move on.
            if not (options.get('-h') == '' or options.get('-i')):
                pprint('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                       format(query, query_map_lng, query_beg, query_end,
                              query_seq, db, db_map_lng, db_beg, db_end,
                              db_seq, strand, edits, edit_string))
                continue

            if gscan:
                # Scan pass: count hits per (query, edits) for later use.
                create_hash_key_chain(hash_edits, 0, query, edits)
                hash_edits[query][edits] += 1
            else:
                # Evaluation pass: print only queries that pass the check.
                evaluation = evaluate_query(query, edits, options)
                if evaluation:
                    pprint(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(query, query_map_lng, query_beg, query_end,
                               query_seq, db, db_map_lng, db_beg, db_end,
                               db_seq, strand, edits, edit_string))
def parse_file_arf(file_arf):
    '''
    read through the signature blastparsed file, fills up a hash with information on queries
    (deep sequences) mapping to the current db (potential precursor) and resolve each
    potential precursor in turn

    file_arf -- path to the 13-column ARF/blastparsed file; assumes lines
                are grouped by db (precursor) -- TODO confirm with caller

    Globals: db_old tracks the previous line's db so a change of db
    triggers resolve_potential_precursor(); hash_query collects per-query
    mapping info for the current db.
    '''
    global db_old, hash_query
    FILENAME = open_or_die(file_arf, 'rb',
                           'could not open file {}\n'.format(file_arf))
    while True:
        line = FILENAME.readline()
        if not line:
            break

        # 13 whitespace-separated columns:
        # query, query_lng, q_beg, q_end, q_seq, db, db_lng, db_beg,
        # db_end, db_seq, strand, edits, edit_string
        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()

            query = m[0]
            query_lng = int(m[1])
            query_beg = int(m[2])
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_lng = int(m[6])
            db_beg = int(m[7])
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]

            # only reads that map sense to the potential precursor are
            # considered
            if strand == "-":
                continue

            # if the new line concerns a new db (potential precursor) then the
            # old db must be resolved
            if db_old and db_old != db:
                # print(db, db_old)
                resolve_potential_precursor()

            # resolve the number of reads that the deep sequence represents
            freq = find_freq(query)

            # read information of the query (deep sequence) into hash
            # NOTE(review): each create_hash_key_chain call is immediately
            # followed by a direct assignment of the same value below, so
            # one of the two appears redundant -- kept as-is; confirm
            # create_hash_key_chain has no other side effects.
            create_hash_key_chain(hash_query, db_beg, query, 'db_beg')
            create_hash_key_chain(hash_query, db_end, query, 'db_end')
            create_hash_key_chain(hash_query, strand, query, 'strand')
            create_hash_key_chain(hash_query, freq, query, 'freq')

            hash_query[query]["db_beg"] = db_beg
            hash_query[query]["db_end"] = db_end
            hash_query[query]["strand"] = strand
            hash_query[query]["freq"] = freq

            db_old = db

    # flush the final db: the loop only resolves on a db change, so the
    # last precursor must be resolved after EOF
    resolve_potential_precursor()