def parse():
    """Feed the export CSV of every search-report entry to ``parse_csv``.

    Everything returned by ``parser.search_report_data()`` (called without
    arguments) is walked as a two-level structure: an outer iterable of
    reports, each an iterable of entries.  For every entry the value under
    ``'patnum_root'`` is used to build the path

        <config.data_dir>/<patnum_root>/<patnum_root>.export.csv

    which is then passed to ``parse_csv``.  Nothing is returned.
    """
    for report_entries in parser.search_report_data():
        for record in report_entries:
            root = record['patnum_root']
            # The CSV sits in a directory named after the root patent number.
            patent_dir = config.data_dir + '/' + root
            parse_csv(patent_dir + '/' + root + '.export.csv')
def lines(patnum, pdf):
    """Collect the display lines for the citations of one search report.

    ``pdf`` has every ``".pdf"`` occurrence replaced by ``".txt"``; the
    result is passed, together with ``config.data_dir + patnum``, to
    ``parser.search_report_data``.  No path separator is inserted between
    the two parts, so ``config.data_dir`` presumably already ends with one
    -- TODO confirm.

    For each entry found, one line of the form
    ``<country>.<patnum>.<kindcode>`` is emitted, with an "XML available"
    HTML marker appended when ``_has_xml(patnum, '<country>.<patnum>')`` is
    truthy.  The entry's ``'refs'`` items follow directly after that line.

    Returns:
        dict with ``'references'`` (the collected list) and ``'patnum'``
        (the ``patnum`` argument, unchanged).
    """
    report_name = pdf.replace(".pdf", ".txt")
    reports = parser.search_report_data(config.data_dir + patnum, report_name)

    references = []
    for report in reports:
        for record in report:
            doc_id = record['country'] + '.' + record['patnum']
            label = doc_id + '.' + record['kindcode']
            if _has_xml(patnum, doc_id):
                label += ' <span class="xml-available">(XML available)</span>'
            references.append(label)
            # The entry's own references are listed right below its label.
            references.extend(record['refs'])

    return {'references': references, 'patnum': patnum}
def parse(epo=False, bz2=False, clef=False):
    """Download every cited patent listed in the search-report data.

    One download function is used for the whole run.  The flags are
    checked in this order and the first truthy one wins:

        bz2  -> download_from_bz2_marec
        epo  -> download_from_epo
        clef -> download_from_clef

    With no flag set, ``download_from_marec`` is used.

    For each entry returned by ``parser.search_report_data()`` the chosen
    function is called with the entry's ``'patnum_root'`` and the cited
    document id ``'<country>.<patnum>'``.  Nothing is returned.
    """
    reports = parser.search_report_data()

    # The flags never change during the run, so pick the backend once.
    if bz2:
        fetch = download_from_bz2_marec
    elif epo:
        fetch = download_from_epo
    elif clef:
        fetch = download_from_clef
    else:
        fetch = download_from_marec

    for report in reports:
        for record in report:
            cited = record['country'] + '.' + record['patnum']
            fetch(record['patnum_root'], cited)
def scan(restrict, restrict_cited): """ Scan data dir for search reports and parses them for citations. """ data = parser.search_report_data() for report in data: for entry in report: refs = entry['refs'] doc = entry['country'] + "." + entry['patnum'] patnum = entry['patnum_root'] if restrict and patnum != restrict: continue for ref in refs: if ref: if restrict_cited and doc != restrict_cited: continue print "processing ", patnum, doc, ref process(patnum, doc, ref)
download.get_pdf_page(country, patnum, kindcode, page + offset, target_folder, "column-" + str(column)) return kindcode, startpage def _get_whitelist(): whitelist = [] try: for line in open('whitelist.txt', 'r').readlines(): line = line.replace('\n', '') whitelist.append(line) except: whitelist = False return whitelist data = parser.search_report_data() whitelist = _get_whitelist() open('perf.cited.txt', 'w').write('') dl = [] for reportfile in data: for entry in reportfile: patnum = entry['patnum'] country = entry['country'] to_download = entry['country'] + '.' + patnum if country == 'WO': to_download = detected.normalize_wo_patnum(patnum)
from base import parser testdata = parser.search_report_data('testdata/') data = parser.search_report_data() def find(key, value, data): found = False for set in data: if set[key] == value: found = True return found for set in testdata: for property in set: patnum = property['patnum'] country = property['country'] kindcode = property['kindcode'] print find('patnum', patnum, set) print find('country', country, set) print find('kindcode', kindcode, set)