import logging
from collections import Counter

# `pp` refers to the project's preprocessing helper module (imported elsewhere in the repo).


def prepareData(data, max_len, dictionary, sufix_len, tags=True):
    """Prepares sequences for training: optional suffix conversion, start/end
    tags, dictionary lookup and length adjustment."""
    if sufix_len > 0:
        data = pp.convertDataToSufixes(data, sufix_len)
    if tags:
        data = pp.addStartEndTagsToSeq(data)
        pp.checkIfTagsAreInDict(dictionary)
    data = pp.convertDataToDict(data, dictionary)
    print('After using dict, the size is {0}'.format(len(data)))
    data = pp.adjustLengthOfSeq(data, max_len)
    return data
def createDict(seq, dict_size, sufix_len):
    """Creates a dictionary of dict_size entries from seq, which is a list of strings."""
    logging.info("Creating dict...")
    seq = pp.convertDataToSufixes(seq, sufix_len)
    # Count word (or suffix) frequencies across all sequences.
    counter = Counter(" ".join(seq).split(" "))
    overall_count = sum(counter.values())
    # Reserve three slots for the special tokens appended below.
    most_common = counter.most_common(dict_size - 3)
    if dict_size - 3 != len(most_common):
        logging.error("Dict size error. There are not enough distinct words in the "
                      "training data to satisfy the requested dict size. "
                      "Actual dict size is: " + str(len(most_common)))
    # Percentage of all word occurrences covered by the dictionary.
    percent = round(float(sum(element[1] for element in most_common)) / overall_count * 100, 2)
    dict_elements = [element[0] for element in most_common]
    dict_elements.append('<start>')
    dict_elements.append('<end>')
    dict_elements.append('<unk>')
    logging.info('Dict covered {0}% of all words'.format(percent))
    return dict_elements
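

# Hypothetical usage sketch (not part of the original module): build a dictionary
# from training sequences, then prepare training and test data with it. The literal
# sentences and parameter values below are illustrative assumptions only.
if __name__ == '__main__':
    train_seq = ["the cat sat", "the dog ran"]
    test_seq = ["the cat ran"]

    # dict_size=8 leaves room for the 5 distinct words plus <start>, <end>, <unk>.
    dictionary = createDict(train_seq, dict_size=8, sufix_len=0)
    train_data = prepareData(train_seq, max_len=20, dictionary=dictionary, sufix_len=0)
    test_data = prepareData(test_seq, max_len=20, dictionary=dictionary, sufix_len=0)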