Example no. 1
def get_distinct_class_substrings(classification_list, first_int, second_int):
    """Return the sorted, distinct [first_int:second_int] slices of every class code."""
    substrings = []
    for element in classification_list:
        for class_ in th.tokenize_text(element):
            substring = class_[first_int:second_int]
            if substring not in substrings:
                substrings.append(substring)
    substrings.sort()
    return substrings
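A minimal usage sketch (hypothetical data; th is stubbed here on the assumption that tokenize_text splits on whitespace, which matches how the snippets use it):
from types import SimpleNamespace

th = SimpleNamespace(tokenize_text=lambda text: text.split())  # assumed behaviour

classifications = ["A01B A01C", "B62D A01B"]
# distinct 3-character prefixes of every class code, sorted
print(get_distinct_class_substrings(classifications, 0, 3))  # -> ['A01', 'B62']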
Example no. 2
def get_class_substrings(classification_list, first_int, second_int):
    """For each element, join the distinct [first_int:second_int] slices of its class codes."""
    result = []
    for element in classification_list:
        string = ""
        for class_ in th.tokenize_text(element):
            substring = class_[first_int:second_int]
            # note: this is a substring membership test on the accumulated string,
            # not an exact match against the tokens already added
            if substring not in string:
                string += substring + ' '
        result.append(string)
    return result
Example no. 3
def shrink_classes(df, row, class_list):
    """Append `row` to `df`, keeping only the class codes absent from `class_list`."""
    if isinstance(row, pd.Series):
        patent_id, text, class_ = row.tolist()
        new_classcodes = []
        classcodes = th.tokenize_text(class_)
        for classcode in classcodes:
            if classcode not in class_list:
                new_classcodes.append(classcode)
        if new_classcodes:
            new_class = ' '.join(new_classcodes)
            # append as a new row at the end of the data frame
            df.loc[df.shape[0] + 1] = [patent_id, text, new_class]
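A hedged driver sketch: shrink_classes is presumably applied row-wise, mirroring how shrink_vocabulary is invoked in Example no. 6 below; source_df and the class list are made-up placeholders, not names from the source.
# hypothetical: rows are read from source_df, surviving rows appended to df
df = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
source_df.apply(lambda row: shrink_classes(df, row, ['A01B']), axis=1)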
Example no. 4
def handle_row(row, ids_list):
    """Collect in `ids_list` the patent ids whose text or class codes are invalid."""
    if isinstance(row, pd.Series):
        try:
            id_, patent_id, text, classcodes = row.tolist()
        except ValueError:
            patent_id, text, classcodes = row.tolist()
        tokens = th.tokenize_text(text)
        if len(tokens) < 2:
            ids_list.append(patent_id)
        elif isinstance(classcodes, str):
            # a valid class code has exactly 4 characters: a section letter A-H,
            # two digits and a final letter (e.g. 'A01B')
            for class_ in th.tokenize_text(classcodes):
                if (len(class_) != 4
                        or class_[0] not in 'ABCDEFGH'
                        or not class_[1].isdigit()
                        or not class_[2].isdigit()
                        or not class_[3].isalpha()):
                    ids_list.append(patent_id)
                    break
        else:
            print('not string')
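The four character checks above can be collapsed into one regular expression; a sketch (the pattern restates the checks in the code, it is not taken from the source project):
import re

# exactly 4 characters: section letter A-H, two digits, a final letter
# (note: str.isalpha also accepts non-ASCII letters, which this pattern does not)
IPC_CODE = re.compile(r"[A-H][0-9]{2}[A-Za-z]")

def is_valid_class(class_):
    return bool(IPC_CODE.fullmatch(class_))

print(is_valid_class('A01B'))  # True
print(is_valid_class('A012'))  # False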
Example no. 5
def clean_text(text):
    text = remove_numbers(text)

    # pre-processing pipeline (NLTK-style)

    # remove punctuation
    text = remove_puntuation(text)

    # normalizing step - convert to lower case
    text = th.to_lowercase(text)

    # split into tokens by white spaces
    # tokens = nltk.word_tokenize(text)
    tokens = th.tokenize_text(text)

    # # is it useful to tag the text?
    # tagged = nltk.pos_tag(tokens)
    # # identify named entities
    # entities = nltk.chunk.ne_chunk(tagged)

    # remove remaining tokens that are not alphabetic
    # words = remove_alphabetic(tokens)

    # remove stop words
    tokens = remove_alternative_stop_words(tokens)

    # lemmatization of words
    # lemmatized = lemmatization_algorithm(words_without_stops)

    # stemming of words
    tokens = krovetz_alternative_stemming_algorithm(tokens)
    # words = stemming_algorithm(words)
    # tokens = porter_stemming_algorithm_without_nltk(tokens)

    # drop words that do not have at least 4 occurrences
    # to_be_removed = remove_words_on_occurrences(stemmed, 4)
    # new_stemmed = list(set(stemmed) - set(to_be_removed))

    # drop words that start with numeric character
    # new_stemmed = remove_numeric_words(new_stemmed)

    # misspelling correction is skipped
    # tokens = remove_word_on_length(tokens, 2)

    return tokens
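The helpers called above (remove_numbers, remove_puntuation, remove_alternative_stop_words, krovetz_alternative_stemming_algorithm) live elsewhere in the project. A self-contained sketch of the same pipeline built on NLTK, with a Porter stemmer standing in for the Krovetz stemmer:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)

def clean_text_sketch(text):
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokens = text.lower().split()  # lowercase, then tokenize on whitespace
    stops = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stops]  # remove stop words
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens]  # stemming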
Example no. 6
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(lambda text: th.tokenize_text(text) if text is not None else [])
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()

    # if this raises an exception, the likely cause is empty texts
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    # accumulate the corpus-wide term frequency of every token id
    vocab_tf = {}
    for bow in corpus:
        for token_id, count in dict(bow).items():
            vocab_tf[token_id] = vocab_tf.get(token_id, 0) + int(count)

    # drop tokens that occur 5 times or fewer across the whole corpus
    remove_ids = [token_id for token_id, count in vocab_tf.items() if count <= 5]
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    # filter_extremes keeps its default no_above=0.5, so tokens appearing in more
    # than half of the documents are dropped; then remove the 30 most frequent
    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
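A standalone sketch of the gensim filtering steps used above (toy corpus; the logic mirrors the function, the documents are made up):
from gensim.corpora import Dictionary

texts = [['engine', 'valve', 'piston'], ['engine', 'valve'], ['engine', 'rotor']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# corpus-wide term frequency per token id, as in further_preprocessing_phase
tf = {}
for bow in corpus:
    for token_id, count in bow:
        tf[token_id] = tf.get(token_id, 0) + count

# drop rare tokens by id, then strip the single most frequent token
dictionary.filter_tokens(bad_ids=[i for i, c in tf.items() if c <= 1])
dictionary.filter_n_most_frequent(1)
print(dictionary.token2id)  # only the mid-frequency token ('valve') survives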
Example no. 7
def apply_vocabulary_processor(text):
    # Build vocabulary (similar to CountVectorizer); `learn` is
    # tensorflow.contrib.learn, which was removed in TensorFlow 2.x
    max_document_length = max(len(th.tokenize_text(x)) for x in text)
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    # TODO: consider tensorflow/transform or tf.data instead
    return np.array(list(vocab_processor.fit_transform(text))), vocab_processor, len(vocab_processor.vocabulary_)
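A sketch of a present-day equivalent using tf.keras.layers.TextVectorization (a substitute technique, not the author's code; assumes texts is a list of whitespace-separated strings):
import tensorflow as tf

def apply_text_vectorization(texts):
    # pad/truncate every document to the longest one, as the original does
    max_document_length = max(len(t.split()) for t in texts)
    vectorizer = tf.keras.layers.TextVectorization(
        output_mode='int', output_sequence_length=max_document_length)
    vectorizer.adapt(texts)
    ids = vectorizer(tf.constant(texts)).numpy()
    return ids, vectorizer, vectorizer.vocabulary_size()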
Example no. 8
def check_out_for_whitespaces(text):
    if isinstance(text, str):
        # keep only tokens between 3 and 30 characters long
        return ' '.join(element for element in th.tokenize_text(text) if 2 < len(element) < 31)
Example no. 9
def get_sequential_layers_values(parameters):
    # drop the trailing character (presumably a delimiter) and return the token list
    return [list(th.tokenize_text(parameters[:-1]))]
Example no. 10
def handle_row(row):
    # `row` is presumably an (index, Series) pair as produced by DataFrame.iterrows()
    if len(th.tokenize_text(row[1][2])) > 1:
        return list(map(lambda item: handle_item(row, item), th.tokenize_text(row[1][2])))
    return [[row[1][1], row[1][2][0], row[1][0]]]
def organize_processed_patent(patent, dtd_version):
    new_patent = {}
    # patents missing an ipc-classification, claims or description cannot be
    # used for classification and are therefore discarded
    if ("classification-ipc" not in patent.keys() or
        "claims" not in patent.keys() or
        "description" not in patent.keys()):
        return None
    try:
        # go through all the values for each tag name of the patent
        for tag_name, values in patent.items():
            new_patent[tag_name] = []
            processed_values = []
            for val in values:
                # remove newline, empty and None entries
                if (not isinstance(val, str) or not re.match(r"^\n", val)) and val is not None:
                    if re.match("^classification", tag_name) or tag_name == "references-cited":
                        val = re.sub(r"\s+", "", val)  # strip all whitespace
                    processed_values.append(val)
                    new_patent[tag_name].append(val)
            # save each ipc-classification of the patent as a list of dictionaries,
            # each containing its section, class and subclass value
            if (tag_name == "classification-ipc"):
                if dtd_version == 2:
                    for value in processed_values:
                        if not re.match("^[A-Z].*", value):
                            return None
                values_text = th.get_string_from_list(th.tokenize_text(th.get_string_from_list(new_patent[tag_name], '')), '')
                # values_text = "".join("".join(new_patent[tag_name]).split())
                new_patent[tag_name] = list(map(lambda x : {"section": x[0], "class": x[1:3], "subclass": x[3]}, re.findall("([A-H][0-9]{2}[A-Z][0-9]{2,4})", values_text)))

            # save the patent's inventors as a comma-separated string of "firstname lastname"
            # pairs (the raw values arrive in groups of four: firstname, lastname, city, country)
            if (tag_name == "inventors"):
                num_elements = len(new_patent[tag_name])
                if num_elements % 4 != 0:
                    num_elements = num_elements - (num_elements % 4)
                # new_patent[tag_name] = ", ".join(list(map(lambda i : new_patent[tag_name][i] + " " + new_patent[tag_name][i+1], range(0, num_elements, 4))))
                new_patent[tag_name] = th.get_string_from_list(list(map(lambda i : new_patent[tag_name][i] + " " + new_patent[tag_name][i+1], range(0, num_elements, 4))), ', ')

            # join all cited references into a single space-separated string
            if (tag_name == "references-cited"):
                new_patent[tag_name] = th.get_string_from_list(new_patent[tag_name], ' ')
                # new_patent[tag_name] = " ".join(list(map(lambda element : element, new_patent[tag_name])))

            # tag names that don't have more than one value are changed from a list to a single value
            if (tag_name in ["invention-title", "classification-national-main", "patent-country", "patent-date", "patent-kind", "patent-doc-number"]):
                try:
                    new_patent[tag_name] = new_patent[tag_name][0]
                except IndexError:
                    new_patent[tag_name] = ''

            if (tag_name == "patent-lang"):
                new_patent[tag_name] = th.get_string_from_list(th.tokenize_text(th.get_string_from_list(new_patent[tag_name], '')), '')
                # new_patent[tag_name] = "".join("".join(new_patent[tag_name]).split())
        return new_patent
    except Exception as e:
        print("new error occurred - processsing patent. Error:", e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return None
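A worked example of the IPC extraction above, runnable with the standard re module alone (the input string is made up): the pattern pulls codes such as A01B3300 out of the concatenated classification text, and the map splits each one into section, class and subclass.
import re

values_text = "A01B3300B62D2500"  # hypothetical concatenated classification string
codes = re.findall("([A-H][0-9]{2}[A-Z][0-9]{2,4})", values_text)
parsed = [{"section": x[0], "class": x[1:3], "subclass": x[3]} for x in codes]
print(parsed)
# [{'section': 'A', 'class': '01', 'subclass': 'B'},
#  {'section': 'B', 'class': '62', 'subclass': 'D'}]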