def _collect_class_codes(root, tag, first, second):
    """Gather at most four class codes from every ``tag`` node below ``root``.

    ``first`` and ``second`` are forwarded unchanged to
    ``th.handle_class_node`` together with the literal ``'text'``; their
    meaning is defined by that helper.

    Returns the list produced by ``th.unique_list`` truncated to four
    entries, or an empty list when no ``tag`` node exists.
    """
    raw = [th.handle_class_node(node, first, second, 'text')
           for node in root.iter(tag)]
    if not raw:
        return raw
    # Drop *every* None / empty entry; list.remove() would only drop the
    # first occurrence and leave the rest in the result.
    kept = [entry for entry in raw if entry is not None and entry != ""]
    return th.unique_list(th.get_flat_list(kept))[:4]


def get_alternative_classcode(patent_document):
    """Return the class codes of a patent document as one space-separated string.

    The codes are read from the ``classification-ipcr`` nodes below the first
    child of ``patent_document``. When that yields an empty string, the
    ``B510`` nodes are used as a fallback.

    Args:
        patent_document: indexable XML element whose first child provides
            ``iter(tag)``.

    Returns:
        The value produced by ``th.get_string_from_list`` for at most four
        class codes.
    """
    sdobi = patent_document[0]
    classcode = th.get_string_from_list(
        _collect_class_codes(sdobi, 'classification-ipcr', 0, 4), " ")
    if classcode == "":
        # Nothing usable in the primary location: try the alternative tag.
        classcode = th.get_string_from_list(
            _collect_class_codes(sdobi, 'B510', 1, 5), " ")
    return classcode
def get_alternative_text(patent_document, marker):
    """Join the values of all direct children whose tag equals ``marker``.

    Args:
        patent_document: iterable of nodes exposing a ``tag`` attribute.
        marker: tag name to select.

    Returns:
        The selected node values joined with a single space via
        ``th.get_string_from_list``, or ``""`` when no child matches.
    """
    values = [th.get_node_value(child)
              for child in patent_document
              if child.tag == marker]
    if not values:
        return ""
    # Filter out every None value; list.remove(None) would only remove the
    # first one and leave later ones in the list handed to the join helper.
    return th.get_string_from_list(
        [value for value in values if value is not None], " ")
def get_alternative_abstract(node):
    """Return the nested text of every child of ``node`` as one string.

    Args:
        node: iterable of abstract sub-nodes, each passed to
            ``get_nested_text``.

    Returns:
        The extracted texts joined with a single space via
        ``th.get_string_from_list``, or ``""`` when ``node`` has no children.
    """
    parts = [get_nested_text(abst) for abst in node]
    if not parts:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    return th.get_string_from_list(
        [part for part in parts if part is not None], " ")
def get_alternative_description(node):
    """Return the nested text of every child of ``node`` as one string.

    Args:
        node: iterable of description sub-nodes, each passed to
            ``get_nested_text``.

    Returns:
        The extracted texts joined with a single space via
        ``th.get_string_from_list``, or ``""`` when ``node`` has no children.
    """
    parts = [get_nested_text(desc) for desc in node]
    if not parts:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    return th.get_string_from_list(
        [part for part in parts if part is not None], " ")
def txt_basic_information(file, kind, classcode, applicant, abstract):
    """Write the basic patent information to ``file``, one item per line.

    Args:
        file: writable text stream (anything with a ``write`` method).
        kind: string written on the first line.
        classcode: string written on the second line.
        applicant: string written on the third line.
        abstract: list handed to ``th.get_string_from_list`` (joined with a
            single space), or ``None`` to emit an empty fourth line.
    """
    file.write(kind + "\n")
    file.write(classcode + "\n")
    file.write(applicant + "\n")
    # Identity check: "!= None" would invoke the object's own comparison,
    # which is ambiguous (and raises) for array-like abstracts.
    if abstract is not None:
        file.write(th.get_string_from_list(abstract, ' ') + "\n")
    else:
        file.write("\n")
def get_alternative_claim(node):
    """Return the claim type and the joined text of a claim node.

    Args:
        node: claim element providing ``iter('claim-text')`` and ``attrib``.

    Returns:
        A ``(claim_type, text)`` tuple. ``claim_type`` is whatever
        ``get_claim_type(node.attrib)`` returns; ``text`` is the nested text
        of all ``claim-text`` descendants joined with a single space, or
        ``""`` when there are none.
    """
    parts = [get_nested_text(claim) for claim in node.iter('claim-text')]
    claim_type = get_claim_type(node.attrib)
    if not parts:
        return claim_type, ""
    # Discard all None results, not just the first one as list.remove() does.
    return claim_type, th.get_string_from_list(
        [part for part in parts if part is not None], " ")
def get_alternative_applicant(patent_document):
    """Return the applicant names of a patent document, comma-separated.

    The names are read with ``th.handle_ending_node(node, 'snm')`` from every
    ``B711`` node below the first child of ``patent_document``.

    Args:
        patent_document: indexable XML element whose first child provides
            ``iter(tag)``.

    Returns:
        The flattened names joined with ``","`` via
        ``th.get_string_from_list``, or ``""`` when no ``B711`` node exists.
    """
    sdobi = patent_document[0]
    names = [th.handle_ending_node(node, 'snm') for node in sdobi.iter('B711')]
    if not names:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    names = th.get_flat_list([name for name in names if name is not None])
    return th.get_string_from_list(names, ",")
def get_alternative_title(node):
    """Return the English title found below ``node``.

    Every ``B541`` descendant whose value is ``EN`` (case-insensitive) marks
    a language; the value of the element returned by its ``getnext()`` is
    taken as the title text.

    Args:
        node: XML element providing ``iter(tag)``; its ``B541`` descendants
            must provide ``getnext()``.

    Returns:
        The matching titles joined with a single space via
        ``th.get_string_from_list``, or ``""`` when none is found.
    """
    titles = [
        th.get_node_value(lang_node.getnext())
        for lang_node in node.iter('B541')
        # get_node_value may yield None (this function already filters None
        # results below), so guard before upper-casing.
        if (th.get_node_value(lang_node) or "").upper() == 'EN'
    ]
    if not titles:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    return th.get_string_from_list(
        [title for title in titles if title is not None], " ")
def get_alternative_citations(patent_document):
    """Return the citations of a patent document as one space-separated string.

    Citations are read with
    ``th.handle_citation_node(node, 'B565EP', 'B561')`` from every ``B560``
    node below the first child of ``patent_document``.

    Args:
        patent_document: indexable XML element whose first child provides
            ``iter(tag)``.

    Returns:
        The flattened, de-duplicated (``th.unique_list``) citations joined
        with a single space, or ``""`` when no ``B560`` node exists.
    """
    sdobi = patent_document[0]
    found = [th.handle_citation_node(node, 'B565EP', 'B561')
             for node in sdobi.iter('B560')]
    if not found:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    found = [entry for entry in found if entry is not None]
    citations = th.unique_list(th.get_flat_super_list(found))
    return th.get_string_from_list(citations, " ")
def patent_classifications(patent):
    """Return the IPC classification string and the national main class.

    Each entry of ``patent["classification-ipc"]`` contributes
    ``section + class + subclass``; the sorted unique codes (``np.unique``)
    are joined with a single space via ``th.get_string_from_list``.

    Args:
        patent: mapping with the key ``"classification-national-main"`` and,
            optionally, ``"classification-ipc"`` (a sequence of mappings with
            the keys ``"section"``, ``"class"`` and ``"subclass"``).

    Returns:
        A ``(classifications, national_main)`` tuple. ``classifications`` is
        ``None`` when there are no IPC entries or they cannot be processed.

    Raises:
        KeyError: if ``"classification-national-main"`` is missing.
    """
    national_main = patent["classification-national-main"]
    try:
        codes = [entry["section"] + entry["class"] + entry["subclass"]
                 for entry in patent["classification-ipc"]]
        if codes:
            # return np.unique(codes), national_main  # alternative
            return th.get_string_from_list(np.unique(codes), ' '), national_main
    except Exception:
        # Best effort by design: a missing or malformed IPC section yields
        # None. Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return None, national_main
    return None, national_main
def get_alternative_country(patent_document):
    """Return the countries of a patent document as one space-separated string.

    Countries are read with ``th.handle_ending_node(node, 'ctry')`` from
    every ``B330`` node below the first child of ``patent_document``.

    Args:
        patent_document: indexable XML element whose first child provides
            ``iter(tag)``.

    Returns:
        The flattened, de-duplicated (``th.unique_list``) countries joined
        with a single space, or ``""`` when no ``B330`` node exists.
    """
    sdobi = patent_document[0]
    countries = [th.handle_ending_node(node, 'ctry')
                 for node in sdobi.iter('B330')]
    if not countries:
        return ""
    # Discard all None results, not just the first one as list.remove() does.
    countries = [entry for entry in countries if entry is not None]
    countries = th.unique_list(th.get_flat_list(countries))
    return th.get_string_from_list(countries, " ")
def txt_text_information(file, claim, description):
    """Write the claim text and then the description text to ``file``.

    Each argument is turned into a string with
    ``th.get_string_from_list(..., ' ')`` and written on its own line.

    Args:
        file: writable text stream (anything with a ``write`` method).
        claim: list of claim text parts.
        description: list of description text parts.
    """
    for section in (claim, description):
        line = th.get_string_from_list(section, ' ')
        file.write(line + "\n")
def organize_processed_patent(patent, dtd_version):
    """Clean up the raw tag values of a parsed patent and normalise their shape.

    For every tag, ``None`` entries and string entries that start with a
    newline are dropped. Values of ``classification*`` and
    ``references-cited`` tags additionally have all whitespace removed.
    Afterwards some tags are reshaped:

    * ``classification-ipc``: list of dicts with the keys ``section``,
      ``class`` and ``subclass``, one per IPC code found in the values.
    * ``inventors``: a single ``", "``-separated string built from the first
      two entries of every complete group of four values (per the original
      comments the groups are first name, last name, city, country).
    * ``references-cited``: a single space-separated string.
    * ``invention-title``, ``classification-national-main``,
      ``patent-country``, ``patent-date``, ``patent-kind``,
      ``patent-doc-number``: the first value, or ``''`` when there is none.
    * ``patent-lang``: the values concatenated and re-tokenized via
      ``th.tokenize_text``, then joined without separator.

    Args:
        patent: mapping of tag name to a list of raw values.
        dtd_version: when equal to ``2``, every IPC classification value must
            start with an upper-case ASCII letter, otherwise the patent is
            rejected.

    Returns:
        The reorganised patent dict, or ``None`` when the patent lacks
        ``classification-ipc``, ``claims`` or ``description``, fails the
        ``dtd_version`` check, or raises any error while being processed
        (the error is printed).
    """
    # A patent without ipc-classification, claims or description cannot be
    # used for classification and is therefore not processed any further.
    if ("classification-ipc" not in patent
            or "claims" not in patent
            or "description" not in patent):
        return None

    single_value_tags = ("invention-title", "classification-national-main",
                         "patent-country", "patent-date", "patent-kind",
                         "patent-doc-number")
    new_patent = {}
    try:
        for tag_name, values in patent.items():
            strip_whitespace = (re.match("^classification", tag_name)
                                or tag_name == "references-cited")
            kept_values = []
            for val in values:
                # Remove None entries and entries that start with a newline.
                if val is None:
                    continue
                if isinstance(val, str) and re.match("(^\\n)", val):
                    continue
                if strip_whitespace:
                    val = re.sub(r"\s+", "", val)
                kept_values.append(val)
            new_patent[tag_name] = kept_values

            if tag_name == "classification-ipc":
                if dtd_version == 2 and any(
                        not re.match("^[A-Z].*", value)
                        for value in kept_values):
                    return None
                values_text = th.get_string_from_list(
                    th.tokenize_text(
                        th.get_string_from_list(kept_values, '')), '')
                # One dict per IPC code: section, class and subclass.
                new_patent[tag_name] = [
                    {"section": code[0], "class": code[1:3],
                     "subclass": code[3]}
                    for code in re.findall(
                        "([A-H][0-9]{2}[A-Z][0-9]{2,4})", values_text)
                ]
            elif tag_name == "inventors":
                # Only complete groups of four values are used; of each
                # group the first two entries form the inventor's name.
                usable = len(kept_values) - (len(kept_values) % 4)
                names = [kept_values[i] + " " + kept_values[i + 1]
                         for i in range(0, usable, 4)]
                new_patent[tag_name] = th.get_string_from_list(names, ', ')
            elif tag_name == "references-cited":
                # Collapse all cited references into one string.
                new_patent[tag_name] = th.get_string_from_list(
                    kept_values, ' ')
            elif tag_name in single_value_tags:
                # Tags that never carry more than one value become a scalar.
                new_patent[tag_name] = kept_values[0] if kept_values else ''
            elif tag_name == "patent-lang":
                new_patent[tag_name] = th.get_string_from_list(
                    th.tokenize_text(
                        th.get_string_from_list(kept_values, '')), '')
        return new_patent
    except Exception as e:
        print("new error occurred - processsing patent. Error:", e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return None