Ejemplo n.º 1
0
def database_session():
    """Fixture function that creates and drops a database.

    Setup: open a psycopg2 connection (host, port, user and password are read
    from the ``POSTGRES_HOST``, ``POSTGRES_PORT``, ``PGUSER`` and ``PGPASSWORD``
    environment variables when ``CI`` is set, otherwise ``127.0.0.1:5432`` is
    used), create the database named by ``DB`` and yield an SQLAlchemy session
    obtained from ``Meta.init(CONN_STRING)``.

    Teardown: close the session, dispose of its engine, reset ``Meta.engine``,
    drop the database and close the connection. The teardown steps sit in
    ``finally`` clauses so the database is dropped and the connection is
    closed even when session creation fails or an exception is thrown into
    the generator at the ``yield``.

    :yield: the SQLAlchemy session bound to the freshly created database.
    """
    if "CI" in os.environ:
        con = psycopg2.connect(
            host=os.environ["POSTGRES_HOST"],
            port=os.environ["POSTGRES_PORT"],
            user=os.environ["PGUSER"],
            password=os.environ["PGPASSWORD"],
        )
    else:
        con = psycopg2.connect(host="127.0.0.1", port="5432")

    try:
        # Setup
        # PostgreSQL refuses CREATE/DROP DATABASE inside a transaction block,
        # hence the autocommit isolation level.
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = con.cursor()
        cursor.execute(f'create database "{DB}";')
        try:
            session = Meta.init(CONN_STRING).Session()
            try:
                yield session
            finally:
                # Teardown of the SQLAlchemy side: every pooled connection
                # must be released before the database can be dropped.
                engine = session.get_bind()
                session.close()
                engine.dispose()
                Meta.engine = None
        finally:
            # Teardown of the database itself; runs once it has been created,
            # whether or not a session was ever handed out.
            cursor.execute(f'drop database "{DB}";')
            cursor.close()
    finally:
        con.close()
Ejemplo n.º 2
0
def insert_equation_tuple(db, all_equations):
    """
    Populate the equation table from the ``Sentence`` rows named 'Equation'.

    For every document, the equation sentences are grouped by section and
    paragraph; the texts of each group are concatenated into one LaTeX string
    and stored as a single ``Equation`` row. The n-th group created for a
    document takes its bounding box and page from the n-th entry of
    ``all_equations['<doc name>.html']``. One commit is issued per document.

    :param db: database connection string
    :param all_equations: extra information generated by ``parser_preprocess.py``;
        indexed here by '<doc name>.html' and then by position, each entry
        providing 'ymin', 'ymax', 'xmin', 'xmax' and 'page_num'
    """
    session = Meta.init(db).Session()
    for doc in session.query(Document):
        boxes = all_equations['%s.html' % doc.name]

        # section_id -> paragraph_id -> list of equation fragments
        grouped = defaultdict(lambda: defaultdict(list))
        doc_sentences = session.query(Sentence).filter(Sentence.document_id == doc.id)
        for sent in doc_sentences:
            if sent.name != 'Equation':
                continue
            fragment = {'text': sent.text, 'page': sent.page}
            grouped[sent.section_id][sent.paragraph_id].append(fragment)

        box_index = 0
        for sec_id, by_paragraph in grouped.items():
            for paragraph_id, fragments in by_paragraph.items():
                latex_code = ''.join(fragment['text'] for fragment in fragments)
                box = boxes[box_index]

                # Variable extraction is disabled, so no variables are stored.
                equation = Equation(
                    name='Equation',
                    document_id=doc.id,
                    section_id=sec_id,
                    paragraph_id=paragraph_id,
                    latex=latex_code,
                    variables=None,
                    top=box['ymin'],
                    bottom=box['ymax'],
                    left=box['xmin'],
                    right=box['xmax'],
                    page=box['page_num'],
                )
                session.add(equation)
                box_index += 1
        session.commit()
Ejemplo n.º 3
0
def insert_equation_tuple(db, resource_loc):
    """
    Insert equations information into the equation table by migrating tuples from the Sentence table.

    For every document, the file ``<resource_loc>/<doc name>.html.json`` is
    loaded and its path printed. The document's sentences are then walked in
    ``paragraph_id`` order; the n-th sentence named 'Equation' is paired with
    the n-th entry of the JSON data, which supplies the bounding box and page.
    When the sentence text does not start with the first few characters of
    the JSON entry's text (all dash variants treated as equal),
    'Not Aligned!!!' is printed, but the row is still inserted.
    One commit is issued per document.

    :param db: db connection string.
    :param resource_loc: Directory storing the json files which contain the equation coordinate information.
    """

    def unify_dashes(text):
        # Hyphen-minus and the minus sign are both mapped to an em dash so
        # that the alignment check ignores which dash variant was used.
        return text.replace('-', '—').replace('−', '—')

    session = Meta.init(db).Session()
    for doc in session.query(Document):
        loc_path = join(resource_loc, '%s.html.json' % doc.name)
        # Context manager: the file handle is closed as soon as the JSON is
        # parsed instead of lingering until garbage collection.
        with open(loc_path) as loc_file:
            locs = json.load(loc_file)
        print(loc_path)

        locs_counter = 0
        doc_sentences = (
            session.query(Sentence)
            .filter(Sentence.document_id == doc.id)
            .order_by(Sentence.paragraph_id)
        )
        for sent in doc_sentences:
            if sent.name != 'Equation':
                continue

            loc = locs[locs_counter]
            length_tmp = len(loc['text'])
            # Compare at most 5 characters, and always one fewer than the
            # full JSON text.
            prefix = loc['text'][:min(5, length_tmp - 1)]
            if not unify_dashes(sent.text).startswith(unify_dashes(prefix)):
                print('Not Aligned!!!')

            e = Equation(
                id=sent.id, name='Equation', document_id=doc.id,
                section_id=sent.section_id, paragraph_id=sent.paragraph_id,
                text=sent.text, variables=[],
                top=loc['ymin'],
                bottom=loc['ymax'],
                left=loc['xmin'],
                right=loc['xmax'],
                page=loc['page_num']
            )

            session.add(e)
            locs_counter += 1
        session.commit()
Ejemplo n.º 4
0
import logging

from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import backref, relationship

from fonduer.meta import Meta
from fonduer.utils.utils import camel_to_under

# Object returned by Meta.init() called without arguments; its ``Base``
# attribute is the base class of the Candidate model defined below.
_meta = Meta.init()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# This global dictionary contains all classes that have been declared in this
# Python environment, so that candidate_subclass() can return a class if it
# already exists and is identical in specification to the requested class
candidate_subclasses = {}


class Candidate(_meta.Base):
    """
    An abstract candidate relation.

    New relation types should be defined by calling candidate_subclass(),
    **not** subclassing this class directly.
    """

    __tablename__ = "candidate"

    #: The unique id for the ``Candidate``.
    id = Column(Integer, primary_key=True)

    #: The type for the ``Candidate``, which corresponds to the names the user
Ejemplo n.º 5
0
def _phrase_box_strings(tops, bottoms, lefts, rights, pages):
    """Format the per-word coordinates of one phrase as space-separated strings.

    The words are grouped by their ``top`` value. For every distinct ``top``
    (in the sorted order produced by ``groupby``) the group's minimum
    ``left``, ``bottom`` and ``page`` and its maximum ``right`` are emitted.
    Each value is followed by a single space, so non-empty results end with
    a trailing space.

    :return: tuple ``(top_str, bottom_str, left_str, right_str, page_str)``.
    """
    df = pd.DataFrame({
        'top': tops,
        'bottom': bottoms,
        'left': lefts,
        'right': rights,
        'page': pages
    })
    by_top = df.groupby('top')
    min_v = by_top.min()
    max_v = by_top.max()

    top_parts = []
    bottom_parts = []
    left_parts = []
    page_parts = []
    # iterrows() is kept on purpose: it decides how each value is typed and
    # therefore how str() renders it.
    for index, row in min_v.iterrows():
        top_parts.append(str(index) + ' ')
        left_parts.append(str(row['left']) + ' ')
        bottom_parts.append(str(row['bottom']) + ' ')
        page_parts.append(str(row['page']) + ' ')
    right_parts = [str(row['right']) + ' ' for _, row in max_v.iterrows()]

    return (''.join(top_parts), ''.join(bottom_parts), ''.join(left_parts),
            ''.join(right_parts), ''.join(page_parts))


def build_table_X(db, corenlp):
    """
    Build the table containing the symbols and phrases for each equation.

    For every equation that has at least one ``Variable`` row, one ``TableX``
    row is added holding:

    * ``symbols``: the distinct variable texts, with surrounding punctuation
      stripped, in first-seen order;
    * ``phrases``: the phrases returned by ``get_phrases``/``remove_symbol``
      for each distinct sentence that contains one of the variables;
    * ``phrases_top/bottom/left/right/page``: one coordinate string per
      phrase (see ``_phrase_box_strings``).

    Equations without variables are reported on stdout and skipped. A single
    commit is issued after all equations have been processed.

    :param db: db connection string.
    :param corenlp: Location of the CoreNLP java file.
    """
    os.environ["CORENLP_HOME"] = corenlp

    # Characters removed from both ends of a variable's text. They are
    # stripped in ONE call so that mixed sequences such as "x.)" are fully
    # cleaned; stripping one character at a time in a fixed order leaves
    # "x." behind.
    punc_to_strip = ',.?(){}[]'

    session = Meta.init(db).Session()
    variables = session.query(Variable).order_by(Variable.equation_id)
    equations = session.query(Equation).order_by(Equation.id)
    sentences = session.query(Sentence).order_by(Sentence.id)
    with CoreNLPClient(annotators=['pos', 'depparse']) as client:
        # Both lists accumulate across ALL equations: symbols seen for an
        # earlier equation stay banned for later equations of the same
        # document.
        vars_used_index = []
        vars_used_text = []

        for eqt in equations:
            # Materialised once; the rows are needed for the emptiness check
            # and for two passes below.
            vars_in_eqt = variables.filter(
                Variable.equation_id == eqt.id).order_by(
                    Variable.sentence_id).all()
            if not vars_in_eqt:
                print('No Variable found for equation ' + str(eqt.id))
                continue

            vars_used = []
            sent_used = set()
            entities = []
            phrases_top = []
            phrases_bottom = []
            phrases_left = []
            phrases_right = []
            phrases_page = []

            # Pass 1: register every variable of this equation so that the
            # ban lists are complete before any phrase is extracted.
            for var in vars_in_eqt:
                vars_used_index.append((eqt.document_id, var.sentence_id,
                                        var.sentence_offset))
                text = var.text.strip(punc_to_strip)
                vars_used_text.append((eqt.document_id, text))
                if text not in vars_used:
                    vars_used.append(text)

            # Pass 2: extract phrases once per distinct sentence.
            for var in vars_in_eqt:
                sent_id = var.sentence_id
                if sent_id in sent_used:
                    continue
                sent_used.add(sent_id)

                target_sent = sentences.filter(Sentence.id == sent_id)[0]
                top = target_sent.top
                bottom = target_sent.bottom
                left = target_sent.left
                right = target_sent.right
                page = target_sent.page

                indices_banned = [
                    t[2] for t in vars_used_index
                    if t[0] == eqt.document_id and t[1] == sent_id
                ]
                text_banned = [
                    t[1] for t in vars_used_text if t[0] == eqt.document_id
                ]

                sent_text = var.sentence_text
                ann = client.annotate(sent_text)
                trees = parseTreeConstruct(ann.sentence, sent_text)
                for tree in trees:
                    phrases = get_phrases(tree, sent_text, [])
                    phrases = remove_symbol(phrases, indices_banned,
                                            text_banned)
                    for phrase in phrases:
                        # The phrase keys index into the sentence's
                        # per-word coordinate lists.
                        keys = sorted(phrase.keys())
                        entities.append(
                            ''.join(phrase[key] + ' ' for key in keys))

                        (top_str, bottom_str, left_str, right_str,
                         page_str) = _phrase_box_strings(
                             [top[key] for key in keys],
                             [bottom[key] for key in keys],
                             [left[key] for key in keys],
                             [right[key] for key in keys],
                             [page[key] for key in keys],
                         )
                        phrases_top.append(top_str)
                        phrases_left.append(left_str)
                        phrases_right.append(right_str)
                        phrases_bottom.append(bottom_str)
                        phrases_page.append(page_str)

            x = TableX(
                equation_id=eqt.id,
                symbols=vars_used,
                phrases=entities,
                phrases_top=phrases_top,
                phrases_bottom=phrases_bottom,
                phrases_left=phrases_left,
                phrases_right=phrases_right,
                phrases_page=phrases_page,
            )

            session.add(x)

        session.commit()
Ejemplo n.º 6
0
def var_in_text(db):
    """
    Extract variables from sentences for each equation.

    For every equation, the paragraphs before and after the equation's own
    paragraph are scanned, moving one ``paragraph_id`` at a time. In each
    direction the scan stops after ``MAX_RANGE`` paragraphs named
    'Body Text' or 'Abstract' have been processed, or as soon as a
    paragraph id has no sentences. Paragraphs with any other name are
    stepped over without counting towards ``MAX_RANGE``.

    A word becomes a ``Variable`` candidate when, with punctuation removed,
    it is not in ``stop_words`` and its lower-cased form is either not in
    the word list read from ``words_alpha.txt`` or at most two characters
    long. The candidate is stored only if ``match(word, equation text)``
    returns a non-negative offset. One commit is issued per equation.

    :param db: db connection string.
    """
    with open('words_alpha.txt') as word_file:
        valid_words = set(word_file.read().split())

    MAX_RANGE = 2
    TEXT_PARAGRAPH_NAMES = ('Body Text', 'Abstract')
    # Compiled once; re.escape keeps ']', '\\', '^' and '-' literal inside
    # the character class.
    punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')
    session = Meta.init(db).Session()

    def add_variables(eqt, sent):
        """Add a Variable for every word of ``sent`` that matches ``eqt``."""
        for idx, word in enumerate(sent.text.split()):
            tmp = punctuation_re.sub('', word)
            # The stop-word test is applied BEFORE lower-casing.
            if tmp in stop_words:
                continue
            tmp = tmp.lower()
            # Ordinary dictionary words longer than two characters are not
            # treated as variable candidates.
            if tmp in valid_words and len(tmp) > 2:
                continue
            offset, score = match(word, eqt.text)
            if offset < 0:
                continue

            v = Variable(text=word,
                         document_id=eqt.document_id,
                         equation_id=eqt.id,
                         equation_text=eqt.text,
                         equation_offset=offset,
                         sentence_id=sent.id,
                         sentence_offset=idx,
                         sentence_text=sent.text,
                         score=score,
                         var_top=sent.top[idx],
                         var_bottom=sent.bottom[idx],
                         var_left=sent.left[idx],
                         var_right=sent.right[idx],
                         var_page=sent.page[idx])
            session.add(v)

    def scan_paragraphs(eqt, sentences, step):
        """Walk away from the equation's paragraph; ``step`` is -1 or +1."""
        count = 0
        id_temp = eqt.paragraph_id
        while count < MAX_RANGE:
            id_temp += step
            # Fetched once per paragraph and reused for the emptiness test,
            # the name test and the iteration.
            sents = sentences.filter(Sentence.paragraph_id == id_temp).all()
            if not sents:
                break
            if sents[0].name in TEXT_PARAGRAPH_NAMES:
                for sent in sents:
                    add_variables(eqt, sent)
                count += 1

    for eqt in session.query(Equation).order_by(Equation.id):
        sentences = session.query(Sentence).filter(
            Sentence.document_id == eqt.document_id)

        # Preceding paragraphs first, then following ones.
        for step in (-1, 1):
            scan_paragraphs(eqt, sentences, step)
        session.commit()
Ejemplo n.º 7
0
from collections import defaultdict
from itertools import chain
import argparse
from fonduer import Meta
from fonduer.parser.models import Document, Sentence
import json

from fonduer.meta import Meta as Mt
from sqlalchemy import Column, Integer, String, Text, ForeignKey
from sqlalchemy.dialects import postgresql
from os.path import join

import re

# PostgreSQL ARRAY type whose elements are SQLAlchemy ``String`` values.
STR_ARRAY_TYPE = postgresql.ARRAY(String)
# Object returned by Meta.init() called without arguments; its ``Base``
# attribute is the base class of the Latex model defined below.
_meta = Mt.init()


class Latex(_meta.Base):
    """Latex representation of sentences"""
    __tablename__ = "latexsentence"

    id = Column(Integer, primary_key=True)

    name = Column(String, unique=False, nullable=True)

    #: The id of the parent ``Document``.
    document_id = Column(Integer)

    #: The id of the parent ``Section``.
    section_id = Column(Integer)