from collections import defaultdict


def load_docs(matches_filepath, url_filepath, text_filepath, threshold):
    map_e2f = {}
    map_f2e = {}
    docs_dict = defaultdict(dict)

    # Keep only the URL pairs whose match score reaches the threshold
    with open(matches_filepath, 'r') as f_mapping:
        for line in f_mapping:
            score, e, f = line.strip().split('\t')
            if float(score) < float(threshold):
                continue
            map_e2f[e] = f
            map_f2e[f] = e

    # The text and URL files are parallel: line i of the URL file is the
    # address of the document on line i of the text file
    with open_xz_or_gzip_or_plain(text_filepath) as f_text, \
            open_xz_or_gzip_or_plain(url_filepath) as f_url:
        for line in f_text:
            text = line.strip()
            url = next(f_url, None).strip()
            if url in map_e2f:
                key = (url, map_e2f[url])
                docs_dict[key]['en_text'] = text
            elif url in map_f2e:
                key = (map_f2e[url], url)
                docs_dict[key]['fr_text'] = text

    return docs_dict
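All of these snippets depend on an open_xz_or_gzip_or_plain helper that is never shown. A minimal sketch of what it presumably does, assuming it dispatches on the file extension; the mode default and the extension checks are assumptions, not the actual implementation:

import gzip
import lzma
from contextlib import contextmanager


@contextmanager
def open_xz_or_gzip_or_plain(filepath, mode="rt"):
    # Assumed behaviour: choose the opener from the extension and fall
    # back to a plain open() for uncompressed files.
    if filepath.endswith(".xz"):
        fd = lzma.open(filepath, mode)
    elif filepath.endswith(".gz"):
        fd = gzip.open(filepath, mode)
    else:
        fd = open(filepath, mode)
    try:
        yield fd
    finally:
        fd.close()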
def score(self, source_filepath, target_filepath):
    source_filepath = self.munge_file_path(source_filepath)
    target_filepath = self.munge_file_path(target_filepath)
    urls = [[], []]

    with open_xz_or_gzip_or_plain(source_filepath) as source_file:
        with open_xz_or_gzip_or_plain(target_filepath) as target_file:
            # start = time.time()
            self.vector_extractor.estimate_idf(source_file, target_file)
            # sys.stderr.write("IDF estimation took {0:.5f} seconds\n".format(time.time() - start))

    # start = time.time()
    # Calculate tf and obtain tf-idf with urls
    with open_xz_or_gzip_or_plain(source_filepath) as source_file:
        urls[0], source_matrix = self.vector_extractor.extract(
            source_file, self.vector_extractor.ndocs_sl)
    with open_xz_or_gzip_or_plain(target_filepath) as target_file:
        urls[1], target_matrix = self.vector_extractor.extract(
            target_file, self.vector_extractor.ndocs_tl)
    # sys.stderr.write("Matrix extraction took {0:.5f} seconds\n".format(time.time() - start))
    # sys.stderr.write(str(source_matrix) + "\n" + str(target_matrix) + "\n")

    # start = time.time()
    del self.vector_extractor
    if source_matrix.getnnz() == 0 or target_matrix.getnnz() == 0:
        d = None
    else:
        d = self.batched_pairwise_distances(source_matrix, target_matrix)
    # sys.stderr.write("Scoring took {0:.5f} seconds\n".format(time.time() - start))

    return urls, d
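batched_pairwise_distances is called above but not defined in the snippet. A sketch of what it might look like, assuming scikit-learn's pairwise_distances with a cosine metric and row-wise batching to bound peak memory; the batch size and the metric are assumptions:

import numpy as np
from sklearn.metrics import pairwise_distances


def batched_pairwise_distances(source_matrix, target_matrix, batch_size=1000):
    # Compute the full distance matrix in row batches so that only
    # batch_size x n_target dense rows are materialised at a time.
    blocks = []
    for start in range(0, source_matrix.shape[0], batch_size):
        block = pairwise_distances(source_matrix[start:start + batch_size],
                                   target_matrix, metric="cosine")
        blocks.append(block)
    return np.vstack(blocks)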
import base64
import re


def extract_urls(html_file, url_file, docs, fileid):
    with open_xz_or_gzip_or_plain(html_file) as hd:
        with open_xz_or_gzip_or_plain(url_file) as ud:
            for url in ud:
                html_content = base64.b64decode(next(hd, None)).decode("utf-8")
                # Collect every href target in the document
                links = re.findall(r'''href\s*=\s*['"]\s*([^'"]+)['"]''', html_content, re.S)
                docs[fileid] = [url, set(links)]
                fileid += 1
    return fileid
def read(path):
    ret = []
    with open_xz_or_gzip_or_plain(path) as reader:
        for line in reader:
            ret.append(line.strip())
    return ret
import base64
import re


def extract_urls(html_file, url_file, docs):
    with open_xz_or_gzip_or_plain(html_file) as hd:
        with open_xz_or_gzip_or_plain(url_file) as ud:
            fileid = 1
            for url in ud:
                html_content = base64.b64decode(next(hd, None)).decode("utf-8")
                links = re.findall(r'''href\s*=\s*['"]\s*([^'"]+)['"]''', html_content, re.S)
                # Strip the document's own scheme://domain prefix so internal
                # links compare equal across documents
                rx = re.match('(https?://[^/:]+)', url)
                if rx is not None:
                    url_domain = rx.group(1)
                    urls = "".join(links).replace(url_domain, "")
                else:
                    urls = "".join(links)
                docs[fileid] = urls
                fileid += 1
import base64
import re


def extract_images(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        for html_base64enc in fd:
            # To compute the edit distance at the level of characters, HTML tags
            # must be encoded as characters and not strings:
            links = re.findall(r'''<img [^>]*src\s*=\s*['"]\s*([^'"]+)['"]''',
                               base64.b64decode(html_base64enc.strip()).decode("utf-8"),
                               re.S)
            docs[fileid] = set(links)
            fileid += 1
import re


def read_urls(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        for u in fd:
            u = u.strip()
            # Drop the scheme://domain prefix so only the path is kept
            rx = re.match('(https?://[^/:]+)', u)
            if rx is not None:
                url_domain = rx.group(1)
                url = u.replace(url_domain, "")
            else:
                url = u
            docs[fileid] = url
            fileid += 1
from collections import defaultdict


def load_extracted(filepath):
    filepath = munge_file_path(filepath)
    with open_xz_or_gzip_or_plain(filepath) as fextract:
        # Group the extracted sentences by the URL of their source document
        documents = defaultdict(list)
        for line in fextract:
            line_split = line.strip().split('\t', 1)
            if len(line_split) != 2:
                continue
            url, text = line_split
            documents[url].append(text)
    return {url: "\n".join(texts) for url, texts in documents.items()}
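A usage sketch for load_extracted, assuming munge_file_path (also not shown) merely normalises the path; the file name is hypothetical and the format is the tab-separated "url<TAB>sentence" layout the function expects:

# sentences.extracted.gz holds one "url<TAB>sentence" pair per line (hypothetical file)
documents = load_extracted("sentences.extracted.gz")
for url, text in documents.items():
    print(url, "->", len(text.split("\n")), "sentences")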
import base64


def extract_structure_representations(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        dic = {'': '_'}
        charidx = 32
        for html_base64enc in fd:
            p = Parser()
            try:
                e = base64.b64decode(html_base64enc.strip()).decode("utf8")
                if e != "":
                    p.feed(e)
                    raspa = "".join(p.output)
                    # Delete entries without *ml in the first tag to avoid things
                    # other than HTML or XML, such as JPGs or PDFs, for example.
                    # To compute the edit distance at the level of characters,
                    # HTML tags must be encoded as characters and not strings:
                    if raspa.split('_')[1][-2:] == "ml" and all(
                            ord(char) < 128 for char in raspa):
                        taglist = raspa.split('_')
                        tagset = set(taglist)
                        if '' in tagset:
                            tagset.remove('')
                        # Add new tags in the raspa, and the character with which
                        # they will be replaced, to the dictionary
                        for tag in tagset:
                            if tag not in dic:
                                dic[tag] = chr(charidx)
                                charidx += 1
                                if charidx == 95:  # skip '_', reserved as the tag separator
                                    charidx += 1
                        docs[fileid] = "".join(dic[tag] for tag in taglist)
                    else:
                        docs[fileid] = " "
            except Exception:  # html.parser.HTMLParseError was removed in Python 3.5
                pass
            finally:
                fileid += 1
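The one-character-per-tag encoding is a deliberate design choice: every distinct HTML tag is mapped to a single printable ASCII character (starting at chr(32) and skipping chr(95), which is reserved as the '_' separator), so the tag sequence of a whole document becomes a short string and the structures of two documents can be compared with an ordinary character-level edit distance.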
    help='File containing the plain text extracted from the HTML documents in a WARC file, '
         'encoded in base64')
parser.add_argument("--splitter", dest="splitter", default="",
                    help="Sentence splitting command")
parser.add_argument("--tokenized", dest="tokenized", action="store_true",
                    help="Don't apply sentence splitter to the text (split by newlines only).")
parser.add_argument("--output_prefix", dest="output_prefix", default="", required=False,
                    help="Prefix for output files within directory")
parser.add_argument("--prune", dest="prune_threshold", type=int, default=80,
                    help="Prune sentences longer than n (words/characters)", required=False)
parser.add_argument("--prune_type", dest="prune_type", choices={"words", "chars"},
                    default="words", help="Prune sentences either by words or characters",
                    required=False)
args = parser.parse_args()

counter = 1
with open_xz_or_gzip_or_plain(args.text_file) as text_reader:
    for line in text_reader:
        text = base64.b64decode(line.strip()).decode("utf-8")
        if not text:
            continue
        if args.tokenized:
            # Already split: a falsy splitter means "split on newlines only"
            split = split_sentences(text, None)
        else:
            split = split_sentences(text, args.splitter)
        for extracted_line in split:
            extracted_line = extracted_line.strip()
            if not extracted_line:
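split_sentences is not shown in the fragment above. A minimal sketch of the behaviour its call sites imply, assuming a falsy splitter command means "split on newlines" and that otherwise the text is piped through the external splitter; the subprocess plumbing is an assumption:

import subprocess


def split_sentences(text, splitter_cmd):
    # Assumed contract: no splitter command -> the text is already one
    # sentence per line, so just split on newlines.
    if not splitter_cmd:
        return text.split("\n")
    # Otherwise pipe the text through the external sentence splitter,
    # which is expected to emit one sentence per line.
    proc = subprocess.run(splitter_cmd, input=text, stdout=subprocess.PIPE,
                          shell=True, universal_newlines=True)
    return proc.stdout.split("\n")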
lang_file = {}
for l in langs_parse:
    if not l.strip():
        continue
    if args.xz:
        lang_file[l] = lzma.open(
            os.path.join(args.output_dir,
                         "{0}{1}.extracted.xz".format(args.output_prefix, l)), "wb")
    else:
        lang_file[l] = gzip.open(
            os.path.join(args.output_dir,
                         "{0}{1}.extracted.gz".format(args.output_prefix, l)), "wb")

with open_xz_or_gzip_or_plain(args.textFile) as text_reader, \
        open_xz_or_gzip_or_plain(args.langFile) as lang_reader, \
        open_xz_or_gzip_or_plain(args.urlFile) as url_reader:
    for line in text_reader:
        text = base64.b64decode(line.strip()).decode("utf-8")
        lang = next(lang_reader, None).strip()
        uri = next(url_reader, None).strip()
        if lang not in langs_parse:
            continue
        if not text:
            continue
        for extracted_line in split_sentences(text, args.splitter):
            extracted_line = extracted_line.strip()
    return b64text.decode()


oparser = argparse.ArgumentParser(
    description="Tool that tokenizes (sentences, tokens and morphemes) plain text")
oparser.add_argument('--text', dest='text', help='Plain text file', required=True)
oparser.add_argument('--sentence-splitter', dest='splitter', required=True,
                     help="Sentence splitter command")
oparser.add_argument('--word-tokenizer', dest='tokenizer', required=True,
                     help="Word tokenisation command")
oparser.add_argument('--morph-analyser', dest='lemmatizer',
                     help="Morphological analyser command")
options = oparser.parse_args()

with open_xz_or_gzip_or_plain(options.text) as reader:
    for line in reader:
        encoded_text = line.strip()
        tokenized = extract_encoded_text(encoded_text, options.splitter,
                                         options.tokenizer, options.lemmatizer)
        print(tokenized)
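extract_encoded_text is only partially visible here (its final return b64text.decode() opens the fragment). A sketch of the pipeline its call site implies, assuming the base64-encoded document is piped through the external splitter, tokenizer and optional lemmatizer commands in turn; the subprocess plumbing and the plain-text return value are assumptions:

import base64
import subprocess


def extract_encoded_text(encoded_text, splitter_cmd, tokenizer_cmd, lemmatizer_cmd):
    # Decode the base64 document, then pipe it through each external
    # command in turn; the morphological analyser stage is optional.
    text = base64.b64decode(encoded_text).decode("utf-8")
    for cmd in (splitter_cmd, tokenizer_cmd, lemmatizer_cmd):
        if not cmd:
            continue
        proc = subprocess.run(cmd, input=text, stdout=subprocess.PIPE,
                              shell=True, universal_newlines=True)
        text = proc.stdout
    return text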
import base64
import logging
import random
from datetime import datetime


def text2prevertical(text_files, url_files, langs, langs_likelihood,
                     boilerplate_likelihood, min_lang_diff_likelihood, seed=-1):
    random.seed(seed if seed >= 0 else None)

    for text_file, url_file in zip(text_files, url_files):
        with open_xz_or_gzip_or_plain(text_file) as text_fd, \
                open_xz_or_gzip_or_plain(url_file) as url_fd:
            for doc_idx, (doc, url) in enumerate(zip(text_fd, url_fd), 1):
                doc = doc.strip()
                url = url.strip().replace('\t', ' ')
                content = ""
                prevertical_content = ""
                current_date = datetime.now().strftime("%Y/%m/%d %H:%M")

                try:
                    content = base64.b64decode(doc.strip()).decode("utf-8")
                except UnicodeDecodeError:
                    logging.warning("unicode decoding error while processing doc #%d", doc_idx)
                    continue

                title = "this is the title"
                # TODO: is the length format correct?
                length = f"{len(content)}" if len(content) < 1024 else f"{len(content) // 1024}k"
                lang = roulette_wheel_selection(langs, langs_likelihood)
                lang_diff = f"{min(random.uniform(min_lang_diff_likelihood, 1.00005), 1.0):.2f}"
                # Random fake IPv4 address; octets range over 0-255
                ip = f"{random.randint(0, 255)}.{random.randint(0, 255)}." \
                     f"{random.randint(0, 255)}.{random.randint(0, 255)}"

                # Greedy document header
                prevertical_content += f"<doc id=\"{doc_idx}\" title=\"{title}\"" \
                                       f" length=\"{length}\" crawl_date=\"{current_date}\"" \
                                       f" lang=\"{lang}\" lang_diff=\"{lang_diff}\"" \
                                       f" ip=\"{ip}\" url=\"{url}\" file_type=\"html\"" \
                                       f" enc_meta=\"utf-8\" enc_chared=\"utf-8\">\n"

                paragraphs = content.strip().replace('\t', ' ').split('\n')
                printed_paragraphs = 0

                for paragraph in paragraphs:
                    paragraph = paragraph.strip()
                    if paragraph == '':
                        continue

                    # Escape HTML entities which might be harmful for XML;
                    # don't escape '&' since we might escape previously
                    # escaped HTML entities twice
                    paragraph = paragraph.replace('<', '&lt;') \
                                         .replace('>', '&gt;') \
                                         .replace('\'', '&apos;') \
                                         .replace('"', '&quot;')

                    lang_diff = f"{min(random.uniform(min_lang_diff_likelihood, 1.00005), 1.0):.2f}"
                    boilerplate = "good" if random.random() > boilerplate_likelihood else "bad"

                    # Greedy paragraph header
                    prevertical_paragraph = \
                        f"<p class=\"{boilerplate}\" cfclass=\"{boilerplate}\" langdiff=\"{lang_diff}\">\n"
                    prevertical_paragraph += paragraph
                    prevertical_paragraph += "\n</p>\n"

                    prevertical_content += prevertical_paragraph
                    printed_paragraphs += 1

                prevertical_content += "</doc>"

                if printed_paragraphs != 0:
                    print(prevertical_content)
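roulette_wheel_selection is not defined in this snippet. A minimal sketch of the usual fitness-proportionate selection its name and call site suggest; treating it as this textbook algorithm is an assumption:

import random


def roulette_wheel_selection(items, likelihoods):
    # Pick one item with probability proportional to its likelihood
    # (fitness-proportionate, a.k.a. roulette-wheel, selection).
    total = sum(likelihoods)
    pick = random.uniform(0, total)
    cumulative = 0.0
    for item, likelihood in zip(items, likelihoods):
        cumulative += likelihood
        if pick <= cumulative:
            return item
    return items[-1]  # guard against floating-point rounding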
lang2_read_docs = {}

# Choose the opener from the file suffix
if args.indices[-3:] == '.xz':
    reader = lzma.open(args.indices, 'rt')
elif args.indices[-3:] == '.gz':
    reader = gzip.open(args.indices, 'rt')
else:
    reader = open(args.indices, 'r')

for line in reader:
    fields = line.split('\t')
    lang2_docs.add(int(fields[args.column2]))
reader.seek(0)

with open_xz_or_gzip_or_plain(args.tokenized1) as tok_reader1, \
        open_xz_or_gzip_or_plain(args.tokenized2) as tok_reader2, \
        open_xz_or_gzip_or_plain(args.text1) as text_reader1, \
        open_xz_or_gzip_or_plain(args.text2) as text_reader2:
    doc1_current_line = 1
    doc2_current_line = 1
    doc2_last_written = 0

    for line in reader:
        fields = line.strip().split('\t')
        doc1 = int(fields[args.column1])
        doc2 = int(fields[args.column2])

        while doc1_current_line <= doc1:
            text1 = next(text_reader1, None).strip()
            tok1 = next(tok_reader1, None).strip()
    type=int, default=80,
    help="Prune sentences longer than n (words/characters)", required=False)
parser.add_argument("--prune_type", dest="prune_type", choices={"words", "chars"},
                    default="words", help="Prune sentences either by words or characters",
                    required=False)
args = parser.parse_args()

counter = 0
if args.tok_file:
    with open_xz_or_gzip_or_plain(args.sent_file) as sent_reader, \
            open_xz_or_gzip_or_plain(args.tok_file) as tok_reader, \
            open_xz_or_gzip_or_plain(args.out_extracted, "wt") as sent_writer, \
            open_xz_or_gzip_or_plain(args.out_tokenized, "wt") as tok_writer:
        for sent_doc in sent_reader:
            counter = counter + 1
            tok_doc = next(tok_reader, None)
            sent_text = base64.b64decode(sent_doc.strip()).decode("utf-8")
            tok_text = base64.b64decode(tok_doc.strip()).decode("utf-8")
            for sent_line, tok_line in zip(sent_text.split("\n"), tok_text.split("\n")):
                sent_line = sent_line.strip()
                tok_line = tok_line.strip()
                if not sent_line or not tok_line:
                    continue
parser.add_argument(
    "--sentences_file", dest="sent_file", default='-',
    help='File containing the sentence-split plain text extracted from the HTML documents '
         'in a WARC file, encoded in base64')
args = parser.parse_args()

counter = 0
if not args.sent_file:
    args.sent_file = '-'

with open_xz_or_gzip_or_plain(args.sent_file) if args.sent_file != '-' else sys.stdin as sent_reader:
    for line in sent_reader:
        counter = counter + 1
        text = base64.b64decode(line.strip()).decode("utf-8")
        n_lines = 0
        for extracted_line in text.split("\n"):
            extracted_line = extracted_line.strip()
            if not extracted_line:
                continue
            n_lines = n_lines + 1
            print(f'{counter}\t{extracted_line}')
        if n_lines == 0:
            print(
import gzip
import logging
import os


def main(args):
    lett_file = args.lett_file
    langs = args.langs
    output_prefix = args.output_prefix
    preprocess_tool = args.preprocess_tool

    if not os.path.isfile(lett_file):
        raise Exception("LETT file is not a file")
    if not os.path.isdir(output_prefix):
        raise Exception("Output prefix is not a dir")

    output_files = {}
    number_output_files_open = {}
    preprocess_files = ("text", "url", "mime", "html")

    with open_xz_or_gzip_or_plain(lett_file) as lett:
        for idx, line in enumerate(lett, 1):
            line = line.rstrip('\n').split('\t')

            if len(line) != 6:
                logging.warning("Line %d: unexpected number of columns: %d vs 6",
                                idx, len(line))
                continue  # the 6-way unpacking below would fail on this line

            lang, mime, encoding, url, base64_html, base64_text = line
            tsd, td, tsu = extract(url)

            if len(langs) == 0 or lang in langs:
                # Write record
                open_files = lang not in output_files

                # Create necessary idx per domain
                if lang not in number_output_files_open:
                    number_output_files_open[lang] = {}
                    number_output_files_open[lang][td] = 0
                    open_files = True
                elif td not in number_output_files_open[lang]:
                    number_output_files_open[lang][td] = len(number_output_files_open[lang])
                    open_files = True

                if open_files:
                    prefix = f"{output_prefix}/{number_output_files_open[lang][td]}/{preprocess_tool}"

                    if not os.path.isdir(prefix):
                        os.makedirs(prefix)
                    if lang not in output_files:
                        output_files[lang] = {}

                    # Open files
                    output_files[lang][td] = {
                        f: gzip.open(f"{prefix}/{f}.gz", "wb") for f in preprocess_files
                    }

                # Write
                output_files[lang][td]["text"].write(base64_text.encode("utf-8"))
                output_files[lang][td]["url"].write(url.encode("utf-8"))
                output_files[lang][td]["mime"].write(mime.encode("utf-8"))
                output_files[lang][td]["html"].write(base64_html.encode("utf-8"))

                for preprocess_file in preprocess_files:
                    output_files[lang][td][preprocess_file].write(b'\n')

    # Close files
    for lang in output_files:
        for td in output_files[lang]:
            for ft in output_files[lang][td]:
                output_files[lang][td][ft].close()
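extract(url) is not defined in this snippet; the three-way tsd, td, tsu unpacking matches the (subdomain, domain, suffix) result of the tldextract library, so the import below is a plausible assumption rather than a confirmed dependency:

# Assumption: extract comes from tldextract, whose ExtractResult unpacks
# as (subdomain, domain, suffix) in the namedtuple-based versions.
from tldextract import extract

tsd, td, tsu = extract("https://forums.example.co.uk/thread")
# tsd == "forums", td == "example", tsu == "co.uk"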