Example no. 1
def test_parse_structure(caplog):
    """Unit test of OmniParserUDF.parse_structure().

    This only tests the structural parse of the document.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_simple/md.html'
    pdf_path = 'tests/data/pdf_simple/md.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab one document, text tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,  # structural
        ["style"],  # blacklist
        ["span", "br"],  # flatten
        '',  # flatten delim
        True,  # lingual
        True,  # strip
        [(u'[\u2010\u2011\u2012\u2013\u2014\u2212\uf02d]', '-')],  # replace
        True,  # tabular
        True,  # visual
        pdf_path,  # pdf path
        Spacy())  # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.text))

    header = phrases[0]
    # Test structural attributes
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Test the unicode parse of delta
    assert phrases[-1].text == "δ13Corg"

    # 45 phrases expected in the "md" document.
    assert len(phrases) == 45
Example no. 2
def test_spacy_integration(caplog):
    """Run a simple e2e parse using spaCy as our parser.

    The point of this test is to actually use the DB just as would be
    done in a notebook by a user.
    """
    #  caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)

    PARALLEL = 2  # Travis only gives 2 cores

    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    docs_path = 'tests/data/html_simple/'
    pdf_path = 'tests/data/pdf_simple/'

    max_docs = 2
    doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = OmniParser(
        structural=True, lingual=True, visual=False, pdf_path=pdf_path)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

    docs = session.query(Document).order_by(Document.name).all()

    for doc in docs:
        logger.info("Doc: {}".format(doc.name))
        for phrase in doc.phrases:
            logger.info("  Phrase: {}".format(phrase.text))

    assert session.query(Document).count() == 2
    assert session.query(Phrase).count() == 81
Example no. 3
def parse(docs_path, pdf_path, max_docs):
    doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = OmniParser(
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
        blacklist=['style', 'script', 'meta', 'noscript'])
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
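A minimal usage sketch for the helper above (not part of the original example): the paths, the document count, and the module-level PARALLEL constant are assumed values chosen to mirror the other examples in this collection.

PARALLEL = 2  # assumed worker count; the helper reads this module-level constant

docs_path = 'tests/data/html_simple/'  # assumed corpus location
pdf_path = 'tests/data/pdf_simple/'    # assumed PDF location

parse(docs_path, pdf_path, max_docs=2)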
Example no. 4
def test_parse_document_md(caplog):
    """Unit test of OmniParser on a single document.

    This tests both the structural and visual parse of the document. This
    also serves as a test of single-threaded parsing.
    """
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    PARALLEL = 1
    max_docs = 2
    docs_path = 'tests/data/html_simple/'
    pdf_path = 'tests/data/pdf_simple/'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Create an OmniParser and parse the md document
    omni = OmniParser(structural=True,
                      lingual=True,
                      visual=True,
                      pdf_path=pdf_path)
    omni.apply(preprocessor, parallelism=PARALLEL)

    # Grab the md document
    doc = session.query(Document).order_by(Document.name).all()[1]

    logger.info("Doc: {}".format(doc))
    for phrase in doc.phrases:
        logger.info("    Phrase: {}".format(phrase.text))

    header = doc.phrases[0]
    # Test structural attributes
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Test visual attributes
    assert header.page == [1, 1]
    assert header.top == [35, 35]
    assert header.bottom == [61, 61]
    assert header.right == [111, 231]
    assert header.left == [35, 117]

    # Test lingual attributes
    assert header.ner_tags == ['O', 'O']
    assert header.dep_labels == ['compound', 'ROOT']

    # 45 phrases expected in the "md" document.
    assert len(doc.phrases) == 45
Example no. 5
def test_parse_document_diseases(caplog):
    """Unit test of OmniParser on a single document.

    This tests both the structural and visual parse of the document.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = SnorkelSession()

    PARALLEL = 2
    max_docs = 2
    docs_path = 'tests/data/html_simple/'
    pdf_path = 'tests/data/pdf_simple/'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Create an OmniParser and parse the documents
    omni = OmniParser(structural=True,
                      lingual=True,
                      visual=True,
                      pdf_path=pdf_path)
    omni.apply(preprocessor, parallelism=PARALLEL)

    # Grab the diseases document
    doc = session.query(Document).order_by(Document.name).all()[0]

    logger.info("Doc: {}".format(doc))
    for phrase in doc.phrases:
        logger.info("    Phrase: {}".format(phrase.text))

    phrase = sorted(doc.phrases)[11]
    logger.info("  {}".format(phrase))
    # Test structural attributes
    assert phrase.xpath == '/html/body/table[1]/tbody/tr[3]/td[1]/p'
    assert phrase.html_tag == 'p'
    assert phrase.html_attrs == ['class=s6', 'style=padding-top: 1pt']

    # Test visual attributes
    assert phrase.page == [1, 1, 1]
    assert phrase.top == [342, 296, 356]
    assert phrase.left == [318, 369, 318]

    # Test lingual attributes
    assert phrase.ner_tags == ['O', 'O', 'GPE']
    assert phrase.dep_labels == ['ROOT', 'prep', 'pobj']

    # 36 phrases expected in the "diseases" document.
    assert len(doc.phrases) == 36
Example no. 6
def test_simple_tokenizer(caplog):
    """Unit test of OmniParser on a single document with lingual features off.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    PARALLEL = 2
    max_docs = 2
    docs_path = 'tests/data/html_simple/'
    pdf_path = 'tests/data/pdf_simple/'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    omni = OmniParser(
        structural=True, lingual=False, visual=True, pdf_path=pdf_path)
    omni.apply(preprocessor, parallelism=PARALLEL)

    doc = session.query(Document).order_by(Document.name).all()[1]

    logger.info("Doc: {}".format(doc))
    for i, phrase in enumerate(doc.phrases):
        logger.info("    Phrase[{}]: {}".format(i, phrase.text))

    header = doc.phrases[0]
    # Test structural attributes
    assert header.xpath == '/html/body/h1'
    assert header.html_tag == 'h1'
    assert header.html_attrs == ['id=sample-markdown']

    # Test lingual attributes
    assert header.ner_tags == ['', '']
    assert header.dep_labels == ['', '']
    assert header.dep_parents == [0, 0]
    assert header.lemmas == ['', '']
    assert header.pos_tags == ['', '']

    assert len(doc.phrases) == 44
Example no. 7
def test_parse_style(caplog):
    """Test style tag parsing."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    session = Meta.init('postgres://localhost:5432/' + ATTRIBUTE).Session()

    max_docs = 1
    docs_path = 'tests/data/html_extended/ext_diseases.html'
    pdf_path = 'tests/data/pdf_extended/ext_diseases.pdf'

    # Preprocessor for the Docs
    preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    # Grab the document, text tuple from the preprocessor
    doc, text = next(preprocessor.generate())
    logger.info("    Text: {}".format(text))

    # Create an OmniParserUDF
    omni_udf = OmniParserUDF(
        True,           # structural
        [],             # blacklist, empty so that style is not blacklisted
        ["span", "br"],  # flatten
        '',             # flatten delim
        True,           # lingual
        True,           # strip
        [],             # replace
        True,           # tabular
        True,           # visual
        pdf_path,       # pdf path
        Spacy())        # lingual parser

    # Grab the phrases parsed by the OmniParser
    phrases = list(omni_udf.parse_structure(doc, text))

    logger.warning("Doc: {}".format(doc))
    for phrase in phrases:
        logger.warning("    Phrase: {}".format(phrase.html_attrs))

    # Phrases for testing
    sub_phrases = [
        {
            'index': 7,
            'attr': [
                'class=col-header',
                'hobbies=work:hard;play:harder',
                'type=phenotype',
                'style=background: #f1f1f1; color: aquamarine; font-size: 18px;'
            ]
        },
        {
            'index': 10,
            'attr': ['class=row-header', 'style=background: #f1f1f1;']
        },
        {
            'index': 12,
            'attr': ['class=cell', 'style=text-align: center;']
        }
    ]
    
    # Assertions
    assert all(phrases[p['index']].html_attrs == p['attr'] for p in sub_phrases)
Example no. 8
import os

# SnorkelSession and PARALLEL are assumed to be imported/defined earlier in the notebook.
session = SnorkelSession()

from fonduer import candidate_subclass

Org_Fig = candidate_subclass('Org_Fig', ['organic', 'figure'])

from fonduer import HTMLPreprocessor, OmniParser

docs_path = os.environ[
    'FONDUERHOME'] + '/tutorials/organic_synthesis_figures/data/html/'
pdf_path = os.environ[
    'FONDUERHOME'] + '/tutorials/organic_synthesis_figures/data/pdf/'

max_docs = float(4)
doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

corpus_parser = OmniParser(structural=True,
                           lingual=True,
                           visual=True,
                           pdf_path=pdf_path)
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

from fonduer import Document, Phrase, Figure

docs = session.query(Document).order_by(Document.name).all()
ld = len(docs)

train_docs = set()
dev_docs = set()
test_docs = set()
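The snippet stops after declaring the empty split sets; a plausible continuation, mirroring the split logic shown in Example no. 9 below, would partition the sorted documents by name. The 0.5/0.75 split fractions are assumed for illustration.

splits = (0.5, 0.75)  # assumed train/dev boundaries, as in Example no. 9
data = [(doc.name, doc) for doc in docs]
data.sort(key=lambda x: x[0])
for i, (doc_name, doc) in enumerate(data):
    if i < splits[0] * ld:
        train_docs.add(doc)
    elif i < splits[1] * ld:
        dev_docs.add(doc)
    else:
        test_docs.add(doc)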
Example no. 9
def test_e2e(caplog):
    """Run an end-to-end test on 20 documents of the hardware domain."""
    caplog.set_level(logging.INFO)
    logger = logging.getLogger(__name__)
    PARALLEL = 2
    max_docs = 12

    session = SnorkelSession()

    Part_Attr = candidate_subclass('Part_Attr', ['part', 'attr'])

    docs_path = 'tests/e2e/data/html/'
    pdf_path = 'tests/e2e/data/pdf/'

    doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = OmniParser(
        structural=True, lingual=True, visual=True, pdf_path=pdf_path)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

    num_docs = session.query(Document).count()
    logger.info("Docs: {}".format(num_docs))
    assert num_docs == max_docs

    num_phrases = session.query(Phrase).count()
    logger.info("Phrases: {}".format(num_phrases))
    #  assert num_phrases == 20

    # Divide into test and train
    docs = session.query(Document).order_by(Document.name).all()
    ld = len(docs)

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    attr_matcher = RegexMatchSpan(
        rgx=r'(?:[1][5-9]|20)[05]', longest_match_only=False)

    ### Transistor Naming Conventions as Regular Expressions ###
    eeca_rgx = r'([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)'
    jedec_rgx = r'(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)'
    jis_rgx = r'(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})'
    others_rgx = r'((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)'

    part_rgx = '|'.join([eeca_rgx, jedec_rgx, jis_rgx, others_rgx])
    part_rgx_matcher = RegexMatchSpan(rgx=part_rgx, longest_match_only=True)

    def get_digikey_parts_set(path):
        """
        Read in the digikey part dictionary and return the set of parts.
        """
        all_parts = set()
        with open(path, "r") as csvinput:
            reader = csv.reader(csvinput)
            for line in reader:
                (part, url) = line
                all_parts.add(part)
        return all_parts

    ### Dictionary of known transistor parts ###
    dict_path = 'tests/e2e/data/digikey_part_dictionary.csv'
    part_dict_matcher = DictionaryMatch(d=get_digikey_parts_set(dict_path))

    def common_prefix_length_diff(str1, str2):
        for i in range(min(len(str1), len(str2))):
            if str1[i] != str2[i]:
                return min(len(str1), len(str2)) - i
        return 0

    def part_file_name_conditions(attr):
        file_name = attr.sentence.document.name
        if len(file_name.split('_')) != 2:
            return False
        if attr.get_span()[0] == '-':
            return False
        name = attr.get_span().replace('-', '')
        return any(char.isdigit() for char in name) and any(
            char.isalpha() for char in name) and common_prefix_length_diff(
                file_name.split('_')[1], name) <= 2

    add_rgx = r'^[A-Z0-9\-]{5,15}$'

    part_file_name_lambda_matcher = LambdaFunctionMatcher(
        func=part_file_name_conditions)
    part_file_name_matcher = Intersect(
        RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
        part_file_name_lambda_matcher)

    part_matcher = Union(part_rgx_matcher, part_dict_matcher,
                         part_file_name_matcher)

    part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3)
    attr_ngrams = OmniNgramsTemp(n_max=2)

    def stg_temp_filter(c):
        (part, attr) = c
        if same_table((part, attr)):
            return (is_horz_aligned((part, attr)) or is_vert_aligned(
                (part, attr)))
        return True

    candidate_filter = stg_temp_filter

    candidate_extractor = CandidateExtractor(
        Part_Attr, [part_ngrams, attr_ngrams], [part_matcher, attr_matcher],
        candidate_filter=candidate_filter)

    candidate_extractor.apply(train_docs, split=0, parallelism=PARALLEL)

    train_cands = session.query(Part_Attr).filter(Part_Attr.split == 0).all()
    logger.info("Number of candidates: {}".format(len(train_cands)))

    for i, docs in enumerate([dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i + 1)
        logger.info("Number of candidates: {}".format(
            session.query(Part_Attr).filter(Part_Attr.split == i + 1).count()))

    featurizer = BatchFeatureAnnotator(Part_Attr)
    F_train = featurizer.apply(
        split=0, replace_key_set=True, parallelism=PARALLEL)
    logger.info(F_train.shape)
    F_dev = featurizer.apply(
        split=1, replace_key_set=False, parallelism=PARALLEL)
    logger.info(F_dev.shape)
    F_test = featurizer.apply(
        split=2, replace_key_set=False, parallelism=PARALLEL)
    logger.info(F_test.shape)

    gold_file = 'tests/e2e/data/hardware_tutorial_gold.csv'
    load_hardware_labels(
        session, Part_Attr, gold_file, ATTRIBUTE, annotator_name='gold')

    def LF_storage_row(c):
        return 1 if 'storage' in get_row_ngrams(c.attr) else 0

    def LF_temperature_row(c):
        return 1 if 'temperature' in get_row_ngrams(c.attr) else 0

    def LF_operating_row(c):
        return 1 if 'operating' in get_row_ngrams(c.attr) else 0

    def LF_tstg_row(c):
        return 1 if overlap(['tstg', 'stg', 'ts'], list(
            get_row_ngrams(c.attr))) else 0

    def LF_to_left(c):
        return 1 if 'to' in get_left_ngrams(c.attr, window=2) else 0

    def LF_negative_number_left(c):
        return 1 if any([
            re.match(r'-\s*\d+', ngram)
            for ngram in get_left_ngrams(c.attr, window=4)
        ]) else 0

    stg_temp_lfs = [
        LF_storage_row, LF_operating_row, LF_temperature_row, LF_tstg_row,
        LF_to_left, LF_negative_number_left
    ]

    labeler = BatchLabelAnnotator(Part_Attr, lfs=stg_temp_lfs)
    L_train = labeler.apply(split=0, clear=True, parallelism=PARALLEL)
    logger.info(L_train.shape)

    L_gold_train = load_gold_labels(session, annotator_name='gold', split=0)

    gen_model = GenerativeModel()
    gen_model.train(
        L_train,
        epochs=500,
        decay=0.9,
        step_size=0.001 / L_train.shape[0],
        reg_param=0)
    logger.info("LF Accuracy: {}".format(gen_model.weights.lf_accuracy))

    L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

    train_marginals = gen_model.marginals(L_train)

    disc_model = SparseLogisticRegression()
    disc_model.train(F_train, train_marginals, n_epochs=200, lr=0.001)

    L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

    test_candidates = [
        F_test.get_candidate(session, i) for i in range(F_test.shape[0])
    ]
    test_score = disc_model.predictions(F_test)
    true_pred = [
        test_candidates[_] for _ in np.nditer(np.where(test_score > 0))
    ]

    pickle_file = 'tests/e2e/data/parts_by_doc_dict.pkl'
    with open(pickle_file, 'rb') as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float('nan')
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float('nan')
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float('nan')

    logger.info("prec: {}".format(prec))
    logger.info("rec: {}".format(rec))
    logger.info("f1: {}".format(f1))

    assert 0.4 < f1 < 0.7

    def LF_test_condition_aligned(c):
        return -1 if overlap(['test', 'condition'],
                             list(get_aligned_ngrams(c.attr))) else 0

    def LF_collector_aligned(c):
        return -1 if overlap([
            'collector', 'collector-current', 'collector-base',
            'collector-emitter'
        ], list(get_aligned_ngrams(c.attr))) else 0

    def LF_current_aligned(c):
        return -1 if overlap(['current', 'dc', 'ic'],
                             list(get_aligned_ngrams(c.attr))) else 0

    def LF_voltage_row_temp(c):
        return -1 if overlap(['voltage', 'cbo', 'ceo', 'ebo', 'v'],
                             list(get_aligned_ngrams(c.attr))) else 0

    def LF_voltage_row_part(c):
        return -1 if overlap(['voltage', 'cbo', 'ceo', 'ebo', 'v'],
                             list(get_aligned_ngrams(c.attr))) else 0

    def LF_typ_row(c):
        return -1 if overlap(['typ', 'typ.'],
                             list(get_row_ngrams(c.attr))) else 0

    def LF_complement_left_row(c):
        return -1 if (overlap(['complement', 'complementary'],
                              chain.from_iterable([
                                  get_row_ngrams(c.part),
                                  get_left_ngrams(c.part, window=10)
                              ]))) else 0

    def LF_too_many_numbers_row(c):
        num_numbers = list(get_row_ngrams(c.attr,
                                          attrib="ner_tags")).count('number')
        return -1 if num_numbers >= 3 else 0

    def LF_temp_on_high_page_num(c):
        return -1 if c.attr.get_attrib_tokens('page')[0] > 2 else 0

    def LF_temp_outside_table(c):
        return -1 if c.attr.sentence.is_tabular() is not None else 0

    def LF_not_temp_relevant(c):
        return -1 if not overlap(
            ['storage', 'temperature', 'tstg', 'stg', 'ts'],
            list(get_aligned_ngrams(c.attr))) else 0

    stg_temp_lfs_2 = [
        LF_test_condition_aligned, LF_collector_aligned, LF_current_aligned,
        LF_voltage_row_temp, LF_voltage_row_part, LF_typ_row,
        LF_complement_left_row, LF_too_many_numbers_row,
        LF_temp_on_high_page_num, LF_temp_outside_table, LF_not_temp_relevant
    ]

    labeler = BatchLabelAnnotator(Part_Attr, lfs=stg_temp_lfs_2)
    L_train = labeler.apply(
        split=0,
        clear=False,
        update_keys=True,
        update_values=True,
        parallelism=PARALLEL)
    gen_model = GenerativeModel()
    gen_model.train(
        L_train,
        epochs=500,
        decay=0.9,
        step_size=0.001 / L_train.shape[0],
        reg_param=0)
    train_marginals = gen_model.marginals(L_train)

    disc_model = SparseLogisticRegression()
    disc_model.train(F_train, train_marginals, n_epochs=200, lr=0.001)

    test_candidates = [
        F_test.get_candidate(session, i) for i in range(F_test.shape[0])
    ]
    test_score = disc_model.predictions(F_test)
    true_pred = [
        test_candidates[_] for _ in np.nditer(np.where(test_score > 0))
    ]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float('nan')
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float('nan')
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float('nan')

    logger.info("prec: {}".format(prec))
    logger.info("rec: {}".format(rec))
    logger.info("f1: {}".format(f1))

    assert f1 > 0.7