Example #1
def load_docs(matches_filepath, url_filepath, text_filepath, threshold):
    map_e2f = {}
    map_f2e = {}

    docs_dict = defaultdict(dict)

    with open(matches_filepath, 'r') as f_mapping:
        for line in f_mapping:
            score, e, f = line.strip().split('\t')
            if float(score) < float(threshold):
                continue

            map_e2f[e] = f
            map_f2e[f] = e

    with open_xz_or_gzip_or_plain(
            text_filepath) as f_text, open_xz_or_gzip_or_plain(
                url_filepath) as f_url:
        for line in f_text:
            text = line.strip()
            url = next(f_url, None).strip()

            if url in map_e2f:
                key = (url, map_e2f[url])
                docs_dict[key]['en_text'] = text

            elif url in map_f2e:
                key = (map_f2e[url], url)
                docs_dict[key]['fr_text'] = text

    return docs_dict
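
The function above keys each document pair by its (English URL, foreign URL) tuple and stores the two sides under 'en_text' and 'fr_text'. A minimal, hypothetical usage sketch (the file names and threshold below are placeholders, not taken from the source):

docs = load_docs("matches.tsv", "url.gz", "plain_text.gz", threshold=0.8)
for (en_url, fr_url), texts in docs.items():
    # Only pairs for which both sides were found carry both keys
    if "en_text" in texts and "fr_text" in texts:
        print(en_url, fr_url)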
Example #2
    def score(self, source_filepath, target_filepath):
        source_filepath = self.munge_file_path(source_filepath)
        target_filepath = self.munge_file_path(target_filepath)
        urls = [[], []]

        with open_xz_or_gzip_or_plain(source_filepath) as source_file:
            with open_xz_or_gzip_or_plain(target_filepath) as target_file:
                #start = time.time()
                self.vector_extractor.estimate_idf(source_file, target_file)
                #sys.stderr.write(
                #    "IDF estimation took {0:.5f} seconds\n".format(time.time() - start))

        #start = time.time()
        #Calculate tf and obtain tf-idf with urls
        with open_xz_or_gzip_or_plain(source_filepath) as source_file:
            urls[0], source_matrix = self.vector_extractor.extract(
                source_file, self.vector_extractor.ndocs_sl)
        with open_xz_or_gzip_or_plain(target_filepath) as target_file:
            urls[1], target_matrix = self.vector_extractor.extract(
                target_file, self.vector_extractor.ndocs_tl)
        #sys.stderr.write(
        #    "Matrix extraction took {0:.5f} seconds\n".format(time.time() - start))
        #sys.stderr.write(str(source_matrix)+"\n"+str(target_matrix)+"\n")
        #start = time.time()
        del self.vector_extractor

        if source_matrix.getnnz() == 0 or target_matrix.getnnz() == 0:
            d = None
        else:
            d = self.batched_pairwise_distances(source_matrix, target_matrix)

        #sys.stderr.write(
        #    "Scoring took {0:.5f} seconds\n".format(time.time() - start))
        return urls, d
Example #3
def extract_urls(html_file, url_file, docs, fileid):
    with open_xz_or_gzip_or_plain(html_file) as hd:
        with open_xz_or_gzip_or_plain(url_file) as ud:
            for url in ud:
                # Decode the base64-encoded HTML document that parallels this URL line
                html_content = base64.b64decode(next(hd, None)).decode("utf-8")
                links = re.findall(r'''href\s*=\s*['"]\s*([^'"]+)['"]''', html_content, re.S)
                docs[fileid] = [url, set(links)]
                fileid += 1
    return fileid
Example #4
def read(path):
    ret = []
    with open_xz_or_gzip_or_plain(path) as reader:
        for line in reader:
            line = line.strip()
            ret.append(line)
    return ret
def extract_urls(html_file, url_file, docs):
    with open_xz_or_gzip_or_plain(html_file) as hd:
        with open_xz_or_gzip_or_plain(url_file) as ud:
            fileid = 1
            for url in ud:
                html_content = base64.b64decode(next(hd, None)).decode("utf-8")

                links = re.findall(r'''href\s*=\s*['"]\s*([^'"]+)['"]''',
                                   html_content, re.S)
                rx = re.match('(https?://[^/:]+)', url)
                if rx is not None:
                    url_domain = rx.group(1)
                    urls = "".join(links).replace(url_domain, "")
                else:
                    urls = "".join(links)
                docs[fileid] = urls
                fileid += 1
Example #6
def extract_images(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        for html_base64enc in fd:
            # To compute the edit distance at the level of characters, HTML tags must be encoded as characters and
            # not strings:
            links = re.findall(
                r'''<img [^>]*src\s*=\s*['"]\s*([^'"]+)['"]''',
                base64.b64decode(html_base64enc.strip()).decode("utf-8"), re.S)
            docs[fileid] = set(links)
            fileid += 1
def read_urls(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        for u in fd:
            u = u.strip()
            rx = re.match('(https?://[^/:]+)', u)
            if rx is not None:
                url_domain = rx.group(1)
                url = u.replace(url_domain, "")
            else:
                url = u
            docs[fileid] = url
            fileid += 1
Example #8
def load_extracted(filepath):
    filepath = munge_file_path(filepath)

    with open_xz_or_gzip_or_plain(filepath) as fextract:
        documents = defaultdict(list)

        for line in fextract:
            line_split = line.strip().split('\t', 1)
            if len(line_split) != 2:
                continue

            url, text = line_split
            documents[url].append(text)

        return {d: "\n".join(documents[d]) for d in documents}
Example #9
def extract_structure_representations(f, docs):
    with open_xz_or_gzip_or_plain(f) as fd:
        fileid = 1
        dic = {}
        charidx = 32
        dic[''] = '_'

        for html_base64enc in fd:
            p = Parser()
            try:
                e = base64.b64decode(html_base64enc.strip()).decode("utf8")
                if e != "":
                    p.feed(e)
                    raspa = "".join(p.output)
                    # Discard entries whose first tag does not end in "ml", so that content other
                    # than HTML or XML (e.g. JPGs or PDFs) is skipped. To compute the edit distance
                    # at the level of characters, HTML tags must be encoded as characters, not strings:
                    if raspa.split('_')[1][-2:] == "ml" and all(
                            ord(char) < 128 for char in raspa):
                        taglist = raspa.split('_')
                        tagset = set(taglist)
                        if '' in tagset:
                            tagset.remove('')
                        # Adding new tags in the raspa and the character with which they will be replaced to the
                        # dictionary
                        for tag in tagset:
                            if tag not in dic:
                                dic[tag] = chr(charidx)
                                charidx += 1
                                if charidx == 95:
                                    charidx += 1
                        translated_taglist = []
                        for tag in taglist:
                            translated_taglist.append(dic[tag])
                        docs[fileid] = "".join(translated_taglist)
                    else:
                        docs[fileid] = " "
            except Exception:  # html.parser.HTMLParseError was removed in Python 3.5; catch decode/parse errors broadly
                pass
            finally:
                fileid += 1
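
To illustrate the encoding above: each distinct tag name in the underscore-separated parser output is mapped to a single printable character (starting at code point 32 and skipping 95, the underscore itself), so two documents can later be compared with a character-level edit distance. A hypothetical walk-through (the actual character assignments depend on set iteration order):

raspa = "_html_head_title_body_p_p_"   # hypothetical parser output
# One possible mapping: {'': '_', 'html': ' ', 'head': '!', 'title': '"', 'body': '#', 'p': '$'}
# The representation stored in docs[fileid] would then be '_ !"#$$_', one character per tag.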
Example #10
                        help='File containing the plain text extracted from the HTML documents in a WARC file, '
                             'encoded in base64')
    parser.add_argument("--splitter", dest="splitter", default="",
                        help="Sentence splitting command")
    parser.add_argument("--tokenized", dest="tokenized", action="store_true",
                        help='Don\'t apply sentence splitter to the text (split by newlines only).')
    parser.add_argument("--output_prefix", dest="output_prefix", default="", required=False,
                        help="Prefix for output files within directory")
    parser.add_argument("--prune", dest="prune_threshold", type=int, default=80,
                        help="Prune sentences longer than n (words/characters)", required=False)
    parser.add_argument("--prune_type", dest="prune_type", choices={"words", "chars"},
                        default="words", help="Prune sentences either by words or characters", required=False)
    args = parser.parse_args()

    counter = 1
    with open_xz_or_gzip_or_plain(args.text_file) as text_reader:
        for line in text_reader:
            text = base64.b64decode(line.strip()).decode("utf-8")

            if not text:
                continue

            if args.tokenized:
                split = split_sentences(text, None)
            else:
                split = split_sentences(text, args.splitter)

            for extracted_line in split:

                extracted_line = extracted_line.strip()
                if not extracted_line:
Example #11
    lang_file = {}
    for l in langs_parse:
        if not l.strip():
            continue
        if args.xz:
            lang_file[l] = lzma.open(
                os.path.join(
                    args.output_dir,
                    "{0}{1}.extracted.xz".format(args.output_prefix, l)), "wb")
        else:
            lang_file[l] = gzip.open(
                os.path.join(
                    args.output_dir,
                    "{0}{1}.extracted.gz".format(args.output_prefix, l)), "wb")

    with open_xz_or_gzip_or_plain(args.textFile) as text_reader, \
            open_xz_or_gzip_or_plain(args.langFile) as lang_reader, \
            open_xz_or_gzip_or_plain(args.urlFile) as url_reader:
        for line in text_reader:
            text = base64.b64decode(line.strip()).decode("utf-8")
            lang = next(lang_reader, None).strip()
            uri = next(url_reader, None).strip()

            if lang not in langs_parse:
                continue

            if not text:
                continue

            for extracted_line in split_sentences(text, args.splitter):
                extracted_line = extracted_line.strip()
Example #12
    return b64text.decode()


oparser = argparse.ArgumentParser(
    description=
    "Tool that tokenizes (sentences, tokens and morphemes) plain text")
oparser.add_argument('--text',
                     dest='text',
                     help='Plain text file',
                     required=True)
oparser.add_argument('--sentence-splitter',
                     dest='splitter',
                     required=True,
                     help="Sentence splitter commands")
oparser.add_argument('--word-tokenizer',
                     dest='tokenizer',
                     required=True,
                     help="Word tokenisation command")
oparser.add_argument('--morph-analyser',
                     dest='lemmatizer',
                     help="Morphological analyser command")

options = oparser.parse_args()

with open_xz_or_gzip_or_plain(options.text) as reader:
    for line in reader:
        encoded_text = line.strip()
        tokenized = extract_encoded_text(encoded_text, options.splitter,
                                         options.tokenizer, options.lemmatizer)
        print(tokenized)
Example #13
def text2prevertical(text_files,
                     url_files,
                     langs,
                     langs_likelihood,
                     boilerplate_likelihood,
                     min_lang_diff_likelihood,
                     seed=-1):
    random.seed(seed if seed >= 0 else None)

    for text_file, url_file in zip(text_files, url_files):
        with open_xz_or_gzip_or_plain(
                text_file) as text_fd, open_xz_or_gzip_or_plain(
                    url_file) as url_fd:
            for doc_idx, (doc, url) in enumerate(zip(text_fd, url_fd), 1):
                doc = doc.strip()
                url = url.strip().replace('\t', ' ')
                content = ""
                prevertical_content = ""
                current_date = datetime.now().strftime("%Y/%m/%d %H:%M")

                try:
                    content = base64.b64decode(doc.strip()).decode("utf-8")
                except UnicodeDecodeError:
                    logging.warning(
                        "unicode decoding error while processing doc #%d",
                        doc_idx)

                    continue

                title = "this is the title"
                # TODO: is the length format correct?
                length = f"{len(content)}" if len(content) < 1024 else f"{len(content) // 1024}k"
                lang = roulette_wheel_selection(langs, langs_likelihood)
                lang_diff = f"{min(random.uniform(min_lang_diff_likelihood, 1.00005), 1.0):.2f}"
                ip = f"{random.randint(0, 256)}.{random.randint(0, 256)}.{random.randint(0, 256)}.{random.randint(0, 256)}"

                # Greedy document header
                prevertical_content += f"<doc id=\"{doc_idx}\" title=\"{title}\"" \
                                       f" length=\"{length}\" crawl_date=\"{current_date}\"" \
                                       f" lang=\"{lang}\" lang_diff=\"{lang_diff}\"" \
                                       f" ip=\"{ip}\" url=\"{url}\" file_type=\"html\"" \
                                       f" enc_meta=\"utf-8\" enc_chared=\"utf-8\">\n"

                paragraphs = content.strip().replace('\t', ' ').split('\n')
                printed_paragraphs = 0

                for paragraph in paragraphs:
                    paragraph = paragraph.strip()

                    if paragraph == '':
                        continue

                    # Escape HTML entities that might be harmful for XML;
                    # don't escape '&', since previously escaped entities would then be escaped twice
                    paragraph = paragraph.replace('<', '&lt;') \
                                         .replace('>', '&gt;') \
                                         .replace('\'', '&apos;') \
                                         .replace('"', '&quot;')

                    lang_diff = f"{min(random.uniform(min_lang_diff_likelihood, 1.00005), 1.0):.2f}"
                    boilerplate = "good" if random.random() > boilerplate_likelihood else "bad"

                    # Greedy paragraph header
                    prevertical_paragraph = f"<p class=\"{boilerplate}\" cfclass=\"{boilerplate}\" langdiff=\"{lang_diff}\">\n"

                    prevertical_paragraph += paragraph
                    prevertical_paragraph += "\n</p>\n"

                    prevertical_content += prevertical_paragraph

                    printed_paragraphs += 1

                prevertical_content += "</doc>"

                if printed_paragraphs != 0:
                    print(prevertical_content)
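
For reference, each document printed by the loop above has roughly this shape (all values are illustrative; the <doc ...> header is emitted on a single line and is wrapped here only for readability):

<doc id="1" title="this is the title" length="2k" crawl_date="2024/01/01 12:00"
     lang="en" lang_diff="0.98" ip="10.0.0.1" url="http://example.com/page"
     file_type="html" enc_meta="utf-8" enc_chared="utf-8">
<p class="good" cfclass="good" langdiff="0.99">
First escaped paragraph of the document.
</p>
</doc>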
Example #14
    lang2_read_docs = {}

    if args.indices.endswith('.xz'):
        reader = lzma.open(args.indices, 'rt')
    elif args.indices.endswith('.gz'):
        reader = gzip.open(args.indices, 'rt')
    else:
        reader = open(args.indices, 'r')

    for line in reader:
        fields = line.split('\t')
        lang2_docs.add(int(fields[args.column2]))

    reader.seek(0)

    with open_xz_or_gzip_or_plain(args.tokenized1) as tok_reader1, \
            open_xz_or_gzip_or_plain(args.tokenized2) as tok_reader2, \
            open_xz_or_gzip_or_plain(args.text1) as text_reader1, \
            open_xz_or_gzip_or_plain(args.text2) as text_reader2:

        doc1_current_line = 1
        doc2_current_line = 1
        doc2_last_written = 0

        for line in reader:
            fields = line.strip().split('\t')
            doc1 = int(fields[args.column1])
            doc2 = int(fields[args.column2])
            while doc1_current_line <= doc1:
                text1 = next(text_reader1, None).strip()
                tok1 = next(tok_reader1, None).strip()
Example #15
        type=int,
        default=80,
        help="Prune sentences longer than n (words/characters)",
        required=False)
    parser.add_argument("--prune_type",
                        dest="prune_type",
                        choices={"words", "chars"},
                        default="words",
                        help="Prune sentences either by words or characters",
                        required=False)
    args = parser.parse_args()

    counter = 0

    if args.tok_file:
        with open_xz_or_gzip_or_plain(args.sent_file) as sent_reader, open_xz_or_gzip_or_plain(args.tok_file) as tok_reader, \
                open_xz_or_gzip_or_plain(args.out_extracted, "wt") as sent_writer, open_xz_or_gzip_or_plain(args.out_tokenized, "wt") as tok_writer:
            for sent_doc in sent_reader:
                counter = counter + 1
                tok_doc = next(tok_reader, None)
                sent_text = base64.b64decode(sent_doc.strip()).decode("utf-8")
                tok_text = base64.b64decode(tok_doc.strip()).decode("utf-8")

                for sent_line, tok_line in zip(sent_text.split("\n"),
                                               tok_text.split("\n")):
                    sent_line = sent_line.strip()
                    tok_line = tok_line.strip()

                    if not sent_line or not tok_line:
                        continue
Example #16
    parser.add_argument(
        "--sentences_file",
        dest="sent_file",
        default='-',
        help='File containing the sentence-split plain text extracted from the HTML documents '
             'in a WARC file, encoded in base64')
    args = parser.parse_args()

    counter = 0

    if not args.sent_file:
        args.sent_file = '-'

    with (open_xz_or_gzip_or_plain(args.sent_file)
          if args.sent_file != '-' else sys.stdin) as sent_reader:
        for line in sent_reader:
            counter = counter + 1
            text = base64.b64decode(line.strip()).decode("utf-8")
            n_lines = 0

            for extracted_line in text.split("\n"):
                extracted_line = extracted_line.strip()
                if not extracted_line:
                    continue
                n_lines = n_lines + 1
                print(f'{counter}\t{extracted_line}')

            if n_lines == 0:
                print(
Example #17
def main(args):
    lett_file = args.lett_file
    langs = args.langs
    output_prefix = args.output_prefix
    preprocess_tool = args.preprocess_tool

    if not os.path.isfile(lett_file):
        raise Exception("LETT file is not a file")
    if not os.path.isdir(output_prefix):
        raise Exception("Output prefix is not a dir")

    output_files = {}
    number_output_files_open = {}
    preprocess_files = ("text", "url", "mime", "html")

    with open_xz_or_gzip_or_plain(lett_file) as lett:
        for idx, line in enumerate(lett, 1):
            line = line.rstrip('\n').split('\t')

            if len(line) != 6:
                logging.warning(
                    "Line %d: unexpected number of columns: %d vs 6", idx,
                    len(line))
                # Skip malformed lines; the unpacking below expects exactly 6 fields
                continue

            lang, mime, encoding, url, base64_html, base64_text = line
            tsd, td, tsu = extract(url)

            if len(langs) == 0 or lang in langs:
                # Write record

                open_files = lang not in output_files

                # Create necessary idx per domain
                if lang not in number_output_files_open:
                    number_output_files_open[lang] = {}
                    number_output_files_open[lang][td] = 0
                    open_files = True
                elif td not in number_output_files_open[lang]:
                    number_output_files_open[lang][td] = len(
                        number_output_files_open[lang])
                    open_files = True

                if open_files:
                    prefix = f"{output_prefix}/{number_output_files_open[lang][td]}/{preprocess_tool}"

                    if not os.path.isdir(prefix):
                        os.makedirs(prefix)

                    if lang not in output_files:
                        output_files[lang] = {}

                    # Open files
                    output_files[lang][td] = {
                        f: gzip.open(f"{prefix}/{f}.gz", "wb")
                        for f in preprocess_files
                    }

                # Write
                output_files[lang][td]["text"].write(
                    base64_text.encode("utf-8"))
                output_files[lang][td]["url"].write(url.encode("utf-8"))
                output_files[lang][td]["mime"].write(mime.encode("utf-8"))
                output_files[lang][td]["html"].write(
                    base64_html.encode("utf-8"))

                for preprocess_file in preprocess_files:
                    output_files[lang][td][preprocess_file].write(b'\n')

    # Close files
    for lang in output_files:
        for td in output_files[lang]:
            for ft in output_files[lang][td]:
                output_files[lang][td][ft].close()
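
Every example on this page goes through the open_xz_or_gzip_or_plain helper. Its implementation is not shown here; a minimal sketch, assuming it is a context manager that dispatches on the file extension and defaults to text mode, could look like this:

import gzip
import lzma
from contextlib import contextmanager


@contextmanager
def open_xz_or_gzip_or_plain(filepath, mode="rt"):
    # Pick the opener from the file suffix; fall back to plain open() otherwise
    if filepath.endswith(".xz"):
        fd = lzma.open(filepath, mode)
    elif filepath.endswith(".gz"):
        fd = gzip.open(filepath, mode)
    else:
        fd = open(filepath, mode)
    try:
        yield fd
    finally:
        fd.close()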