def __init__(self, path="", rootDir=configuration.get_path_to_reports()):
    """Remember a report path and the root directory it is relative to.

    path    -- path string to inspect (defaults to the empty string).
    rootDir -- root directory of the reports tree; the default is whatever
               configuration.get_path_to_reports() returned when this
               method was defined.

    Sets self.path, self.rootDir (always ending in '/'),
    self.CONN_STRING and self.pathParts.
    """
    self.path = path
    self.rootDir = rootDir
    self.CONN_STRING = configuration.get_connection_string()

    # Normalise the root so that it always carries a trailing slash.
    has_trailing_slash = self.rootDir.endswith('/')
    if not has_trailing_slash:
        self.rootDir = self.rootDir + '/'

    # Drop as many leading characters as the root is long, then split the
    # remainder on '/'.  NOTE: the prefix is removed purely by length; the
    # code does not check that path actually starts with rootDir.
    prefix_length = len(self.rootDir)
    relative_path = self.path[prefix_length:]
    self.pathParts = relative_path.split('/')
import os, sys
import psycopg2
import argparse
import configuration

CONN_STRING = configuration.get_connection_string()


def get_entities(max_occurence):
    """Return the names of entities that occur more than max_occurence times.

    Queries the ``entities`` table for every (entity_text,
    entity_inferred_name) pair whose entity_type is not 'Currency' and
    whose row count exceeds ``max_occurence``.

    max_occurence -- threshold passed to the query as a bound parameter;
                     only groups with count(*) strictly greater than this
                     value are returned.

    Returns a set containing both the entity_text and the
    entity_inferred_name of every matching group.

    Any exception raised by psycopg2 propagates to the caller; the
    connection is closed either way.
    """
    cmd = ("select entity_text, entity_inferred_name, count(*) c from entities "
           "where entity_type != 'Currency' group by entity_text, "
           "entity_inferred_name having count(*) > %s order by c desc")

    conn = psycopg2.connect(CONN_STRING)
    # The close() used to sit after the return statement and was never
    # reached, so each call leaked a connection.  try/finally guarantees
    # the connection is released on both the success and the error path.
    try:
        cur = conn.cursor()
        cur.execute(cmd, (max_occurence,))
        entities = set()
        for record in cur.fetchall():
            # record is (entity_text, entity_inferred_name, count)
            entities.add(record[0])
            entities.add(record[1])
        return entities
    finally:
        conn.close()
def __init__(self, path="", rootDir=configuration.get_path_to_bills()):
    """Remember a bill path and the root directory of the bills tree.

    path    -- path string to store (defaults to the empty string).
    rootDir -- root directory of the bills tree; the default is whatever
               configuration.get_path_to_bills() returned when this
               method was defined.

    Sets self.path, self.rootDir (always ending in '/') and
    self.CONN_STRING.
    """
    self.path = path
    self.rootDir = rootDir
    self.CONN_STRING = configuration.get_connection_string()

    # Normalise the root so that it always carries a trailing slash.
    has_trailing_slash = self.rootDir.endswith('/')
    if not has_trailing_slash:
        self.rootDir = self.rootDir + '/'
# Script: list the bills that have at least one entity of type 'Currency'
# in the old_billentities table and resolve each bill id to a path.
# NOTE(review): only the beginning of the script is visible here; names that
# look unused below (split_ids, bpu, conn, pre_window) may be used further on.
import os, sys
import codecs
import psycopg2
import csv
from path_tools import BillPathUtils
from sunlight_id_to_path import sunlightid_to_path
import configuration

CONN_STRING = configuration.get_connection_string()
# The connection is opened at module level, as soon as the script runs.
conn = psycopg2.connect(CONN_STRING)
cmd = "select distinct bill_id from old_billentities where entity_type = 'Currency'"
cur = conn.cursor()
cur.execute(cmd)
# The query selects a single column, so the bill id is element 0 of each row.
ids = cur.fetchall()
# Each bill id is split on '-'; the commented-out line below indexes the
# resulting pieces 0..2, so ids presumably have three dash-separated parts
# -- TODO confirm the id format.
split_ids = [ i[0].split("-") for i in ids]
bpu = BillPathUtils();
# Earlier way of building the paths, kept for reference:
#paths = [bpu.get_bill_path( int(split_id[1]), split_id[0], split_id[2])+'document.txt' for split_id in split_ids]
paths = [sunlightid_to_path(i[0]) for i in ids]
# Quick sanity output: the first three paths and the number of matching bills.
print paths[0:3]
print "bills with Currency", len(ids)
# get bill from file
# Presumably the column layout of an old_billentities row -- verify against
# the table definition:
# schema: id, entity_text, entity_type, entity_offset, entity_length, entity_name, bill_id
# index:  0   1            2            3              4              5            6
# NOTE(review): presumably the amount of context taken before an entity;
# its use and units are not visible in this part of the script -- confirm.
pre_window = 700