def database_session():
    """Fixture function that creates and drops a database.

    As a setup, a database is created and an SQLAlchemy session is passed
    to a test. As a teardown, the database is dropped after the test runs.

    Yields:
        An SQLAlchemy session bound to the freshly created database.
    """
    if "CI" in os.environ:
        con = psycopg2.connect(
            host=os.environ["POSTGRES_HOST"],
            port=os.environ["POSTGRES_PORT"],
            user=os.environ["PGUSER"],
            password=os.environ["PGPASSWORD"],
        )
    else:
        con = psycopg2.connect(host="127.0.0.1", port="5432")

    # Setup: CREATE DATABASE cannot run inside a transaction, hence autocommit.
    con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cursor = con.cursor()
    cursor.execute(f'create database "{DB}";')
    session = Meta.init(CONN_STRING).Session()
    try:
        yield session
    finally:
        # Fix: run the teardown even when the test body raises, so the
        # scratch database and the psycopg2 connection never leak between
        # test runs.
        engine = session.get_bind()
        session.close()
        engine.dispose()
        Meta.engine = None
        cursor.execute(f'drop database "{DB}";')
        cursor.close()
        con.close()
def insert_equation_tuple(db, all_equations):
    """
    Insert equations information from XML into the equation table by migrating
    tuples from the Sentence table.

    :param db: database connection string
    :param all_equations: extra information generated by ``parser_preprocess.py``
    """
    session = Meta.init(db).Session()
    for doc in session.query(Document):
        coords = all_equations['%s.html' % doc.name]
        coord_idx = 0

        # Bucket this document's equation sentences by (section, paragraph).
        grouped = defaultdict(lambda: defaultdict(list))
        doc_sentences = session.query(Sentence).filter(Sentence.document_id == doc.id)
        for sentence in doc_sentences:
            if sentence.name == 'Equation':
                grouped[sentence.section_id][sentence.paragraph_id].append(
                    {'text': sentence.text, 'page': sentence.page}
                )

        # One Equation row per (section, paragraph) group, consuming the
        # pre-extracted coordinates in order.
        for section_id, by_paragraph in grouped.items():
            for para_id, fragments in by_paragraph.items():
                latex_code = ''.join(frag['text'] for frag in fragments)
                # variables = list(get_variables(latex_code))
                variables = []
                variables = None if not variables or variables[0] == -1 else variables
                loc = coords[coord_idx]
                session.add(Equation(
                    name='Equation',
                    document_id=doc.id,
                    section_id=section_id,
                    paragraph_id=para_id,
                    latex=latex_code,
                    variables=variables,
                    top=loc['ymin'],
                    bottom=loc['ymax'],
                    left=loc['xmin'],
                    right=loc['xmax'],
                    page=loc['page_num'],
                ))
                coord_idx += 1
    session.commit()
def insert_equation_tuple(db, resource_loc):
    """
    Insert equations information into the equation table by migrating tuples
    from the Sentence table.

    :param db: db connection string.
    :param resource_loc: Directory storing the json files which contain the
        equation coordinate information.
    """
    session = Meta.init(db).Session()
    for doc in session.query(Document):
        loc_path = join(resource_loc, '%s.html.json' % doc.name)
        # Fix: close the coordinates file deterministically instead of
        # leaking the handle via json.load(open(...)).
        with open(loc_path) as loc_file:
            locs = json.load(loc_file)
        print(loc_path)
        locs_counter = 0
        for sent in session.query(Sentence).filter(
                Sentence.document_id == doc.id).order_by(Sentence.paragraph_id):
            if sent.name == 'Equation':
                loc = locs[locs_counter]
                length_tmp = len(loc['text'])
                # Sanity check: compare a short, dash-normalized prefix of the
                # DB sentence against the JSON text to detect when the two
                # sources have drifted out of alignment.
                if not sent.text.replace('-', '—').replace('−', '—')\
                        .startswith(loc['text'][:min(5, length_tmp - 1)]
                                    .replace('-', '—').replace('−', '—')):
                    print('Not Aligned!!!')
                e = Equation(
                    id=sent.id,
                    name='Equation',
                    document_id=doc.id,
                    section_id=sent.section_id,
                    paragraph_id=sent.paragraph_id,
                    text=sent.text,
                    variables=[],
                    top=loc['ymin'],
                    bottom=loc['ymax'],
                    left=loc['xmin'],
                    right=loc['xmax'],
                    page=loc['page_num'],
                )
                session.add(e)
                locs_counter += 1
    session.commit()
import logging from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint from sqlalchemy.orm import backref, relationship from fonduer.meta import Meta from fonduer.utils.utils import camel_to_under _meta = Meta.init() logger = logging.getLogger(__name__) # This global dictionary contains all classes that have been declared in this # Python environment, so that candidate_subclass() can return a class if it # already exists and is identical in specification to the requested class candidate_subclasses = {} class Candidate(_meta.Base): """ An abstract candidate relation. New relation types should be defined by calling candidate_subclass(), **not** subclassing this class directly. """ __tablename__ = "candidate" #: The unique id for the ``Candidate``. id = Column(Integer, primary_key=True) #: The type for the ``Candidate``, which corresponds to the names the user
def build_table_X(db, corenlp):
    """
    Build the table containing the symbols and phrases for each equation.

    :param db: db connection string.
    :param corenlp: Location of the CoreNLP java file.
    """
    os.environ["CORENLP_HOME"] = corenlp
    session = Meta.init(db).Session()
    variables = session.query(Variable).order_by(Variable.equation_id)
    equations = session.query(Equation).order_by(Equation.id)
    sentences = session.query(Sentence).order_by(Sentence.id)
    with CoreNLPClient(annotators=['pos', 'depparse']) as client:
        # Accumulated across ALL equations: (document_id, sentence_id, offset)
        # and (document_id, stripped_text) of every variable occurrence.
        # Consulted below to ban variable tokens from extracted phrases.
        vars_used_index = []
        vars_used_text = []
        for eqt in equations:
            vars_in_eqt = variables.filter(
                Variable.equation_id == eqt.id).order_by(Variable.sentence_id)
            if vars_in_eqt.count() == 0:
                print('No Variable found for equation ' + str(eqt.id))
            else:
                vars_used = []
                sent_used = []
                entities = []
                phrases_top = []
                phrases_bottom = []
                phrases_left = []
                phrases_right = []
                phrases_page = []
                # Pass 1: collect the distinct (punctuation-stripped) variable
                # texts for this equation and record every occurrence.
                for var in vars_in_eqt:
                    vars_used_index.append(
                        (eqt.document_id, var.sentence_id, var.sentence_offset))
                    PUNC_TO_STRIP = [
                        ',', '.', '?', '(', ')', '{', '}', '[', ']'
                    ]
                    text = var.text
                    for punc in PUNC_TO_STRIP:
                        text = text.strip(punc)
                    vars_used_text.append((eqt.document_id, text))
                    if text not in vars_used:
                        vars_used.append(text)
                # Pass 2: parse each sentence containing a variable and extract
                # candidate phrases plus their word-level bounding boxes.
                for var in vars_in_eqt:
                    sent_id = var.sentence_id
                    target_sent = sentences.filter(Sentence.id == sent_id)[0]
                    # Word-level coordinate arrays of the sentence.
                    top = target_sent.top
                    bottom = target_sent.bottom
                    left = target_sent.left
                    right = target_sent.right
                    page = target_sent.page
                    # Word offsets / texts of known variables, to be removed
                    # from phrases so symbols don't pollute the phrase text.
                    indices_banned = [
                        t[2] for t in vars_used_index
                        if t[0] == eqt.document_id and t[1] == sent_id
                    ]
                    text_banned = [
                        t[1] for t in vars_used_text
                        if t[0] == eqt.document_id
                    ]
                    if sent_id not in sent_used:  # annotate each sentence once
                        sent_used.append(sent_id)
                        sent_text = var.sentence_text
                        ann = client.annotate(sent_text)
                        sentences_ann = ann.sentence
                        trees = parseTreeConstruct(sentences_ann, sent_text)
                        for tree in trees:
                            phrases = get_phrases(tree, sent_text, [])
                            phrases = remove_symbol(
                                phrases, indices_banned, text_banned)
                            # Each phrase maps word index -> word text.
                            for phrase in phrases:
                                phrase_text = ''
                                top_tmp = []
                                bottom_tmp = []
                                left_tmp = []
                                right_tmp = []
                                page_tmp = []
                                top_str = ''
                                bottom_str = ''
                                left_str = ''
                                right_str = ''
                                page_str = ''
                                # Rebuild the phrase text in word order and
                                # gather the per-word coordinates.
                                for key in sorted(phrase.keys()):
                                    phrase_text += phrase[key] + ' '
                                    top_tmp.append(top[key])
                                    bottom_tmp.append(bottom[key])
                                    left_tmp.append(left[key])
                                    right_tmp.append(right[key])
                                    page_tmp.append(page[key])
                                entities.append(phrase_text)
                                # Collapse words sharing the same 'top' (i.e.
                                # the same visual line) into one box per line:
                                # min gives the left edge, max the right edge.
                                df = pd.DataFrame({
                                    'top': top_tmp,
                                    'bottom': bottom_tmp,
                                    'left': left_tmp,
                                    'right': right_tmp,
                                    'page': page_tmp
                                })
                                maxV = df.groupby('top').max()
                                minV = df.groupby('top').min()
                                for index, row in minV.iterrows():
                                    top_str += str(index)
                                    top_str += ' '
                                    left_str += str(row['left'])
                                    left_str += ' '
                                    bottom_str += str(row['bottom'])
                                    bottom_str += ' '
                                    page_str += str(row['page'])
                                    page_str += ' '
                                for index, row in maxV.iterrows():
                                    right_str += str(row['right'])
                                    right_str += ' '
                                phrases_top.append(top_str)
                                phrases_left.append(left_str)
                                phrases_right.append(right_str)
                                phrases_bottom.append(bottom_str)
                                phrases_page.append(page_str)
                # One TableX row per equation with all symbols and phrases.
                x = TableX(
                    equation_id=eqt.id,
                    symbols=vars_used,
                    phrases=entities,
                    phrases_top=phrases_top,
                    phrases_bottom=phrases_bottom,
                    phrases_left=phrases_left,
                    phrases_right=phrases_right,
                    phrases_page=phrases_page,
                )
                session.add(x)
    session.commit()
def var_in_text(db):
    """
    Extract variables from sentences for each equation.

    For every equation, up to ``MAX_RANGE`` paragraphs immediately before and
    after the equation's paragraph are scanned (only 'Body Text' / 'Abstract'
    paragraphs). Every token that is not a dictionary word (or is very short)
    and fuzzy-matches the equation text is stored as a ``Variable`` row.

    :param db: db connection string.
    """
    with open('words_alpha.txt') as word_file:
        valid_words = set(word_file.read().split())

    MAX_RANGE = 2
    session = Meta.init(db).Session()
    # Fix: compile the punctuation-stripping pattern once, with re.escape so
    # regex metacharacters inside string.punctuation (']', '\\', '-', '^')
    # cannot corrupt the character class; the matched set is unchanged.
    punct_re = re.compile('[' + re.escape(string.punctuation) + ']')

    def get_all_equations():
        return session.query(Equation).order_by(Equation.id)

    def get_sentences_in_doc(doc_id):
        return session.query(Sentence).filter(Sentence.document_id == doc_id)

    def scan_paragraphs(eqt, sentences, step):
        """Walk paragraphs away from the equation (step=-1 backward, +1
        forward) and add a Variable for every candidate token found."""
        count = 0
        id_temp = eqt.paragraph_id
        while count < MAX_RANGE:
            id_temp += step
            sents = sentences.filter(Sentence.paragraph_id == id_temp)
            if sents.count() == 0:
                break  # ran off the document in this direction
            if sents[0].name == 'Body Text' or sents[0].name == 'Abstract':
                for sent in sents:
                    for idx, word in enumerate(sent.text.split()):
                        tmp = punct_re.sub('', word)
                        if tmp in stop_words:
                            continue
                        tmp = tmp.lower()
                        # Non-dictionary (or very short) tokens are variable
                        # candidates; keep only those that fuzzy-match the
                        # equation text.
                        if tmp not in valid_words or len(tmp) <= 2:
                            offset, score = match(word, eqt.text)
                            if offset >= 0:
                                session.add(Variable(
                                    text=word,
                                    document_id=eqt.document_id,
                                    equation_id=eqt.id,
                                    equation_text=eqt.text,
                                    equation_offset=offset,
                                    sentence_id=sent.id,
                                    sentence_offset=idx,
                                    sentence_text=sent.text,
                                    score=score,
                                    var_top=sent.top[idx],
                                    var_bottom=sent.bottom[idx],
                                    var_left=sent.left[idx],
                                    var_right=sent.right[idx],
                                    var_page=sent.page[idx]))
            count += 1

    for eqt in get_all_equations():
        sentences = get_sentences_in_doc(eqt.document_id)
        # Fix: the backward and forward scans were two near-identical
        # copy-pasted loops differing only in the step sign; they now share
        # one helper.
        scan_paragraphs(eqt, sentences, -1)  # paragraphs before the equation
        scan_paragraphs(eqt, sentences, +1)  # paragraphs after the equation
    session.commit()
from collections import defaultdict from itertools import chain import argparse from fonduer import Meta from fonduer.parser.models import Document, Sentence import json from fonduer.meta import Meta as Mt from sqlalchemy import Column, Integer, String, Text, ForeignKey from sqlalchemy.dialects import postgresql from os.path import join import re STR_ARRAY_TYPE = postgresql.ARRAY(String) _meta = Mt.init() class Latex(_meta.Base): """Latex representation of sentences""" __tablename__ = "latexsentence" id = Column(Integer, primary_key=True) name = Column(String, unique=False, nullable=True) #: The id of the parent ``Document``. document_id = Column(Integer) #: The id of the parent ``Section``. section_id = Column(Integer)