def import_rows(rows):
    """Bulk-insert earmark rows into the earmark_documents table.

    Each row is (earmark_id, document_id, page_number, excerpt).  Integer
    fields pass through unchanged; other fields are re-encoded
    latin1 -> utf8 and truncated to fit the column; values that cannot
    be transcoded are inserted as-is.
    """
    CONN_STRING = configuration.get_connection_string()

    MAX_LEN = 4096        # column limit on the excerpt text
    TRUNCATE_TO = 4050    # keep a safety margin below the limit

    earmarks = []
    for row in rows:
        new_earmark = []
        for item in row:
            if isinstance(item, int):
                new_earmark.append(item)
            else:
                try:
                    # Source data is latin1; the database expects utf8.
                    new_item = item.decode('latin1').encode('utf8')
                    if len(new_item) >= MAX_LEN:
                        new_item = new_item[:TRUNCATE_TO]
                    new_earmark.append(new_item)
                except (AttributeError, UnicodeError):
                    # Was a bare `except:`; narrowed to the two failure
                    # modes that actually occur (non-string value, or a
                    # value that will not transcode).
                    new_earmark.append(item)
        earmarks.append(new_earmark)

    cmd = ("insert into earmark_documents "
           "(earmark_id, document_id, page_number, excerpt) "
           "values (%s, %s, %s, %s)")
    conn = psycopg2.connect(CONN_STRING)
    try:
        cur = conn.cursor()
        cur.executemany(cmd, earmarks)
        conn.commit()
    finally:
        # Previously the connection leaked if executemany raised.
        conn.close()
def import_rows(rows):
    """Insert earmark rows into earmark_documents.

    Non-int fields are transcoded latin1 -> utf8 and clipped to the
    column size; anything that cannot be transcoded is kept unchanged.

    NOTE(review): this is a verbatim duplicate of an identical
    `import_rows` defined earlier in this file — consider removing one.
    """
    conn_string = configuration.get_connection_string()

    cleaned = []
    for record in rows:
        fields = []
        for value in record:
            if isinstance(value, int):
                fields.append(value)
                continue
            try:
                encoded = value.decode('latin1').encode('utf8')
            except:
                # Keep the original value when transcoding fails.
                fields.append(value)
            else:
                if len(encoded) >= 4096:
                    encoded = encoded[:4050]
                fields.append(encoded)
        cleaned.append(fields)

    conn = psycopg2.connect(conn_string)
    cmd = "insert into earmark_documents (earmark_id, document_id, page_number, excerpt) values (%s, %s, %s, %s)"
    params = rows  # NOTE(review): unused; retained from the original
    cur = conn.cursor()
    cur.executemany(cmd, cleaned)
    conn.commit()
    conn.close()
def __init__(self, **options):
    """Configure the Wikipedia-categories feature generator.

    Recognized keyword options (all optional):
        depth              -- category traversal depth (default 3)
        distinguish_levels -- keep per-level distinction (default True)
        force              -- force regeneration (default True)
    Unrecognized keywords are ignored.
    """
    settings = {"depth": 3, "distinguish_levels": True, "force": True}
    settings.update(options)

    self.name = "wikipedia_categories_feature_generator"
    self.depth = settings["depth"]
    self.distinguish_levels = settings["distinguish_levels"]
    self.force = settings["force"]
    self.feature_prefix = "WIKI_CATEGORY_"
    # Sentinel feature emitted when no Wikipedia page matches.
    self.NO_WIKI_PAGE_FEATURE = "NO_WIKIPEDIA_PAGE_WAS_FOUND"
    self.CONN_STRING = configuration.get_connection_string()
import os, sys, inspect

# Make the project root importable when run as a script.
sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], ".."))))

from util import configuration
import psycopg2
import csv, pandas as pd
import codecs
from pprint import pprint

USAGE = "python %s <input-csv-file>" % (sys.argv[0])
CONN_STRING = configuration.get_connection_string()


def import_csv_file(path):
    """Load earmark rows from a UTF-8 CSV file and pretty-print them.

    Columns 2 and 3 (page_number, excerpt — TODO confirm against the CSV
    header) are both blanked whenever either is not a string, which is
    how pandas surfaces empty cells (NaN floats).

    Returns None.

    NOTE(review): the original opened a psycopg2 connection and then hit
    a debug `pprint(rows); return`, leaking the connection and leaving
    the insert code unreachable.  The pointless connect and the dead
    statements after the `return` have been removed; restore the insert
    deliberately when debugging is done.
    """
    rows = []
    stuff = pd.read_csv(codecs.open(path, 'r', 'utf-8'))
    for row in stuff.iterrows():
        v = list(row[1])
        # Blank both text columns when either holds a non-string (NaN).
        if not isinstance(v[3], basestring) or not isinstance(v[2], basestring):
            v[3] = ''
            v[2] = ''
        rows.append(v)
    pprint(rows)
    return
# NOTE(review): truncated duplicate of the import block and
# import_csv_file defined earlier in this file; the function body below
# stops after collecting rows — no printing or DB code follows here.
sys.path.insert(
    0,
    os.path.realpath(
        os.path.abspath(
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                ".."))))

from util import configuration
import psycopg2
import csv, pandas as pd
import codecs
from pprint import pprint

USAGE = "python %s <input-csv-file>" % (sys.argv[0])
CONN_STRING = configuration.get_connection_string()


def import_csv_file(path):
    # Read a UTF-8 CSV and collect its rows as lists; columns 2 and 3
    # are both blanked when either is not a string (pandas yields NaN
    # floats for empty cells).
    rows = []
    stuff = pd.read_csv(codecs.open(path, 'r', 'utf-8'))
    i = 0  # row counter, only used by the commented-out debug print
    for row in stuff.iterrows():
        v = list(row[1])
        i = i + 1
        #print i
        if not isinstance(v[3], basestring) or not isinstance(
                v[2], basestring):
            v[3] = ''
            v[2] = ''
        rows.append(v)
import os, sys, inspect

# Make the project root importable when run as a script.
sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], ".."))))

import argparse
import os
import psycopg2
import psycopg2.extras
from util import configuration

CONN_STRING = configuration.get_connection_string()

# BUG FIX: this import was fused onto the end of the previous statement
# (missing newline after get_connection_string()), a syntax error.
import multiprocessing as mp
import string
import re
import logging

from classification.pipe import Pipe
from classification.blocks_pipe import BlocksPipe
from classification.instances_grouper import InstancesGrouper
from classification.prepare_earmark_data import serialize_instances, load_instances
from matching.feature_generators.jaccard_feature_generator import JaccardFeatureGenerator
from matching.feature_generators.ranking_feature_generator import RankingFeatureGenerator
from matching.feature_generators.difference_feature_generator import DifferenceFeatureGenerator
from matching.feature_generators.infix_feature_generator import InfixFeatureGenerator
from matching.feature_generators.table_feature_generator import TableFeatureGenerator
from matching.matching_util import *

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Threshold constant; its consumer is outside this chunk — TODO confirm.
MIN = 0.1