Ejemplo n.º 1
0
def prepareData(data, max_len, dictionary, sufix_len, tags=True):
    """
    Run ``data`` through the ``pp`` preprocessing pipeline.

    The steps, in order, are: optional suffix conversion, optional
    start/end tagging (plus a check of ``dictionary`` for the tags),
    conversion through ``dictionary`` and length adjustment to ``max_len``.

    Args:
        data: Input handed to the ``pp`` helpers; must support ``len()``
            after ``pp.convertDataToDict`` (presumably a list of
            sequences -- confirm against callers).
        max_len: Forwarded to ``pp.adjustLengthOfSeq``.
        dictionary: Forwarded to ``pp.checkIfTagsAreInDict`` and
            ``pp.convertDataToDict``.
        sufix_len: When greater than 0, ``data`` is first passed through
            ``pp.convertDataToSufixes`` with this value.
        tags: When true, ``pp.addStartEndTagsToSeq`` is applied to ``data``
            and ``pp.checkIfTagsAreInDict`` is called on ``dictionary``.

    Returns:
        The result of ``pp.adjustLengthOfSeq`` on the converted data.
    """
    if sufix_len > 0:
        data = pp.convertDataToSufixes(data, sufix_len)
    if tags:
        # Same order as before: tag the data first, then check the dictionary.
        data = pp.addStartEndTagsToSeq(data)
        pp.checkIfTagsAreInDict(dictionary)
    data = pp.convertDataToDict(data, dictionary)
    print('After using dict, the size is {0}'.format(len(data)))
    return pp.adjustLengthOfSeq(data, max_len)
Ejemplo n.º 2
0
def createDict(seq, dict_size, sufix_len):
    """
    Create a dictionary (token list) of ``dict_size`` entries from ``seq``.

    ``seq`` is a list of strings. It is passed through
    ``pp.convertDataToSufixes``, joined and split on single spaces, and the
    most frequent tokens are kept. The special tokens ``<start>``,
    ``<end>`` and ``<unk>`` are appended at the end, in that order.

    Args:
        seq: List of strings to build the dictionary from.
        dict_size: Requested total size, including the three special tokens.
        sufix_len: Forwarded to ``pp.convertDataToSufixes``.

    Returns:
        List of tokens ordered from most to least frequent, followed by the
        special tokens. When the data holds fewer distinct tokens than
        requested, the list is shorter than ``dict_size`` and an error is
        logged.
    """
    logging.info("Creating dict...")
    # Single source of truth for the reserved slots, instead of a bare 3.
    special_tokens = ['<start>', '<end>', '<unk>']
    wanted = dict_size - len(special_tokens)

    seq = pp.convertDataToSufixes(seq, sufix_len)
    # NOTE: split(" ") (unlike split()) yields empty-string tokens for
    # consecutive spaces or empty input; they are counted like any other.
    counter = Counter(" ".join(seq).split(" "))
    overall_count = sum(counter.values())
    most_common = counter.most_common(wanted)

    if wanted != len(most_common):
        logging.error(
            "Dict size error. There is not enough different words in training data to satisfay dict size. Dict size len is: %s",
            len(most_common),
        )

    # How much of the token occurrences the kept entries cover, in percent.
    covered = sum(count for _, count in most_common)
    percent = round(float(covered) / overall_count * 100, 2)
    logging.info('Dict covered {0}% of all words'.format(percent))

    return [token for token, _ in most_common] + special_tokens