def get_distinct_class_substrings(classification_list, first_int, second_int):
    # collect the sorted, distinct [first_int:second_int] slices over all class tokens
    list_ = []
    for element in classification_list:
        for class_ in th.tokenize_text(element):
            if class_[first_int:second_int] not in list_:
                list_.append(class_[first_int:second_int])
    list_.sort()
    return list_
def get_class_substrings(classification_list, first_int, second_int):
    # for each classification string, build a space-separated string of its
    # distinct [first_int:second_int] slices
    list_ = []
    for element in classification_list:
        string = ""
        for class_ in th.tokenize_text(element):
            if class_[first_int:second_int] not in string:
                string += class_[first_int:second_int] + ' '
        list_.append(string)
    return list_
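# Hedged usage sketch for the two helpers above, assuming th.tokenize_text splits
# on whitespace (the real helper may behave differently):
#
#     classifications = ['A01B G06F', 'G06F H04L']
#     get_distinct_class_substrings(classifications, 0, 1)  # -> ['A', 'G', 'H']
#     get_class_substrings(classifications, 0, 4)           # -> ['A01B G06F ', 'G06F H04L ']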
def shrink_classes(df, row, class_list):
    if isinstance(row, pd.Series):
        patent_id, text, class_ = row.tolist()
        # keep only the classcodes that are not in class_list
        new_classcodes = []
        classcodes = th.tokenize_text(class_)
        for classcode in classcodes:
            if classcode not in class_list:
                new_classcodes.append(classcode)
        if new_classcodes != []:
            new_class = ' '.join(new_classcodes)
            df.loc[df.shape[0] + 1] = [patent_id, text, new_class]
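# Note on the append idiom in shrink_classes: df.loc[df.shape[0] + 1] writes to
# label shape+1, so an initially empty frame gets labels 1, 2, 3, ... and label 0
# is never used. Minimal illustration with plain pandas:
#
#     import pandas as pd
#     df = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
#     df.loc[df.shape[0] + 1] = ['p1', 'some text', 'G06F']
#     df.loc[df.shape[0] + 1] = ['p2', 'more text', 'H04L']
#     list(df.index)  # -> [1, 2]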
def handle_row(row, ids_list):
    if isinstance(row, pd.Series):
        try:
            id_, patent_id, text, classcodes = row.tolist()
        except ValueError:
            # rows without a surrogate id only carry three fields
            patent_id, text, classcodes = row.tolist()
        tokens = th.tokenize_text(text)
        if len(tokens) < 2:
            # texts with fewer than two tokens are flagged for removal
            ids_list.append(patent_id)
        elif isinstance(classcodes, str):
            for class_ in th.tokenize_text(classcodes):
                # a valid code has 4 characters: a section letter A-H,
                # two digits and a subclass letter
                if not (len(class_) == 4
                        and class_[0] in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
                        and class_[1].isdigit() and class_[2].isdigit()
                        and class_[3].isalpha()):
                    ids_list.append(patent_id)
                    break
        else:
            print('not string')
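# The character-by-character validity test above can be expressed as one regular
# expression. A hedged alternative sketch (not the project's code; [0-9] and
# [A-Za-z] approximate str.isdigit/str.isalpha, which also accept Unicode):
#
#     import re
#     IPC_SUBCLASS = re.compile(r'^[A-H][0-9]{2}[A-Za-z]$')
#     def is_valid_subclass(code):
#         return bool(IPC_SUBCLASS.match(code))  # e.g. True for 'G06F'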
def clean_text(text):
    # pre-processing (NLTK): numbers, punctuation, stop words
    text = remove_numbers(text)
    # remove punctuation
    text = remove_puntuation(text)
    # normalizing step - convert to lower case
    text = th.to_lowercase(text)
    # split into tokens by white spaces
    # tokens = nltk.word_tokenize(text)
    tokens = th.tokenize_text(text)
    # # is it useful to tag the text?
    # tagged = nltk.pos_tag(tokens)
    # # identify named entities
    # entities = nltk.chunk.ne_chunk(tagged)
    # remove remaining tokens that are not alphabetic
    # words = remove_alphabetic(tokens)
    # remove stop words
    tokens = remove_alternative_stop_words(tokens)
    # lemmatization of words
    # lemmatized = lemmatization_algorithm(words_without_stops)
    # stemming of words
    tokens = krovetz_alternative_stemming_algorithm(tokens)
    # words = stemming_algorithm(words)
    # tokens = porter_stemming_algorithm_without_nltk(tokens)
    # drop words that do not have at least 4 occurrences
    # to_be_removed = remove_words_on_occurrences(stemmed, 4)
    # new_stemmed = list(set(stemmed) - set(to_be_removed))
    # drop words that start with a numeric character
    # new_stemmed = remove_numeric_words(new_stemmed)
    # do not fix misspellings
    # tokens = remove_word_on_length(tokens, 2)
    return tokens
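# Illustrative call for clean_text; the exact output is an assumption, since it
# depends on the helpers (remove_numbers, remove_puntuation, the stop-word list,
# the Krovetz stemmer) that are not shown in this file:
#
#     clean_text('The 3 engines are running!')  # -> e.g. ['engine', 'run']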
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(lambda text: th.tokenize_text(text) if text is not None else '')
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()
    # if it raises an exception, the empty texts could be the cause
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]
    print('original dictionary size: ', len(patent_dictionary))
    # total term frequency of every token id across the corpus
    vocab_tf = {}
    for i in corpus:
        for item, count in dict(i).items():
            if item in vocab_tf:
                vocab_tf[item] += int(count)
            else:
                vocab_tf[item] = int(count)
    # drop tokens with total frequency <= 5, then the 30 most frequent ones
    remove_ids = []
    for id_, count in vocab_tf.items():
        if count <= 5:
            remove_ids.append(id_)
    patent_dictionary.filter_tokens(bad_ids=remove_ids)
    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)
    print('parsed dictionary size: ', len(patent_dictionary))
    vocabulary = list(patent_dictionary.token2id.keys())
    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
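# Self-contained sketch of the gensim filtering used above (thresholds mirror the
# function: total frequency <= 5 removed, then the 30 most frequent dropped). On
# this toy corpus every token has frequency <= 5, so all of them are removed;
# real data keeps the mid-frequency vocabulary.
#
#     from gensim.corpora import Dictionary
#     texts = [['patent', 'engine'], ['patent', 'valve']]
#     d = Dictionary(texts)
#     tf = {}
#     for doc in (d.doc2bow(t) for t in texts):
#         for token_id, count in doc:
#             tf[token_id] = tf.get(token_id, 0) + count
#     d.filter_tokens(bad_ids=[i for i, c in tf.items() if c <= 5])
#     d.filter_n_most_frequent(30)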
def apply_vocabulary_processor(text):
    # Build vocabulary (similar to CountVectorizer)
    max_document_length = max([len(th.tokenize_text(x)) for x in text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    # TODO: tensorflow/transform or tf.data
    return np.array(list(vocab_processor.fit_transform(text))), vocab_processor, len(vocab_processor.vocabulary_)
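# learn.preprocessing.VocabularyProcessor belongs to the long-deprecated
# tf.contrib.learn API (TF 1.x). A hedged modern alternative, not a drop-in
# replacement, is tf.keras.layers.TextVectorization:
#
#     import tensorflow as tf
#     vectorizer = tf.keras.layers.TextVectorization(output_mode='int')
#     vectorizer.adapt(text)               # text: list of raw strings
#     ids = vectorizer(tf.constant(text))  # padded integer-id matrix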
def check_out_for_whitespaces(text):
    if isinstance(text, str):
        return ' '.join([element for element in th.tokenize_text(text) if 2 < len(element) < 31])
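# Hedged example for check_out_for_whitespaces, again assuming whitespace
# tokenization: tokens shorter than 3 or longer than 30 characters are dropped
# and the rest re-joined with single spaces.
#
#     check_out_for_whitespaces('a an engine  turbine')  # -> 'engine turbine'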
def get_sequential_layers_values(parameters):
    # the identity map over the tokens was redundant; tokenize the
    # parameter string without its trailing character
    return [list(th.tokenize_text(parameters[:-1]))]
def handle_row(row):
    # row is an (index, Series) pair as yielded by DataFrame.iterrows();
    # row[1][k] accesses the k-th column of the Series by position
    if len(th.tokenize_text(row[1][2])) > 1:
        return list(map(lambda item: handle_item(row, item), th.tokenize_text(row[1][2])))
    return [[row[1][1], row[1][2][0], row[1][0]]]
def organize_processed_patent(patent, dtd_version):
    new_patent = {}
    # if the patent does not have an ipc-classification it cannot be used for
    # classification and is therefore removed and no longer processed
    if ("classification-ipc" not in patent.keys()
            or "claims" not in patent.keys()
            or "description" not in patent.keys()):
        return None
    try:
        # go through all the values for each tag name of the patent
        for tag_name, values in patent.items():
            new_patent[tag_name] = []
            processed_values = []
            for val in values:
                # remove newline, empty and None entries
                if (not isinstance(val, str) or not re.match("(^\\n)", val)) and val is not None:
                    if re.match("^classification", tag_name) or tag_name == "references-cited":
                        val = re.sub(r"\s+", "", val)  # remove the whitespaces
                        processed_values.append(val)
                    new_patent[tag_name].append(val)
            # save each ipc-classification of the patent as a list of dictionaries,
            # each containing its section, class and subclass value
            if tag_name == "classification-ipc":
                if dtd_version == 2:
                    for value in processed_values:
                        if not re.match("^[A-Z].*", value):
                            return None
                values_text = th.get_string_from_list(th.tokenize_text(th.get_string_from_list(new_patent[tag_name], '')), '')
                # values_text = "".join("".join(new_patent[tag_name]).split())
                new_patent[tag_name] = list(map(lambda x: {"section": x[0], "class": x[1:3], "subclass": x[3]},
                                                re.findall("([A-H][0-9]{2}[A-Z][0-9]{2,4})", values_text)))
            # save the inventors of the patent as one string of "firstname lastname" pairs
            if tag_name == "inventors":
                num_elements = len(new_patent[tag_name])
                if num_elements % 4 != 0:
                    num_elements = num_elements - (num_elements % 4)
                new_patent[tag_name] = th.get_string_from_list(
                    list(map(lambda i: new_patent[tag_name][i] + " " + new_patent[tag_name][i + 1],
                             range(0, num_elements, 4))), ', ')
            # save the cited references of the patent as one space-separated string
            if tag_name == "references-cited":
                new_patent[tag_name] = th.get_string_from_list(new_patent[tag_name], ' ')
                # new_patent[tag_name] = " ".join(new_patent[tag_name])
            # tag names that don't have more than one value are changed from a list to a single value
            if tag_name in ["invention-title", "classification-national-main", "patent-country",
                            "patent-date", "patent-kind", "patent-doc-number"]:
                try:
                    new_patent[tag_name] = new_patent[tag_name][0]
                except IndexError:
                    new_patent[tag_name] = ''
            if tag_name == "patent-lang":
                new_patent[tag_name] = th.get_string_from_list(th.tokenize_text(th.get_string_from_list(new_patent[tag_name], '')), '')
                # new_patent[tag_name] = "".join("".join(new_patent[tag_name]).split())
        return new_patent
    except Exception as e:
        print("new error occurred - processing patent. Error:", e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return None
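# Self-contained sketch of the classification-ipc parsing above: the regex pulls
# IPC codes out of the concatenated tag text, and each match is split into its
# section / class / subclass parts (values_text here is made-up sample input).
#
#     import re
#     values_text = 'G06F1730H04L2906'
#     codes = re.findall('([A-H][0-9]{2}[A-Z][0-9]{2,4})', values_text)
#     # codes -> ['G06F1730', 'H04L2906']
#     [{'section': x[0], 'class': x[1:3], 'subclass': x[3]} for x in codes]
#     # -> [{'section': 'G', 'class': '06', 'subclass': 'F'}, ...]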