def train(self, train_set):
    '''
    The training data come from docs_path and json_data_path.
    In init: self.train_raw
    '''
    @object_hashing(
        cache_comment='svm_models_%s' % hash_obj(train_set),
        cachedir=constants.get_path()['cache'])
    def _train(train_raw):
        # Single-model variant: only the svm from the last topic is
        # returned (train_raw appears to hold a single 'all' topic).
        for topic in train_raw:
            x_train = []
            y_train = []
            for inst in train_raw[topic]:
                feature_vector = [ext.extract(inst[0], inst[1])
                                  for ext in self.extractors]
                x_train.append(feature_vector)
                y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            # Dump the extracted features once for later inspection.
            features_path = constants.get_path()['tmp'] + '/ltr-features-all'
            if not os.path.exists(features_path):
                with open(features_path, 'wb') as mf:
                    json.dump({'x_train': x_train, 'y_train': y_train},
                              mf, indent=2)
            svm.train(x_train, y_train)
        all_docs = [inst[1]
                    for topic in train_raw
                    for inst in train_raw[topic]]
        return svm, all_docs

    self.model, self.all_docs = _train(train_set)
def train(self, train_set):
    '''
    train_set is null; the training data come from docs_path
    and json_data_path.
    '''
    @object_hashing(
        cache_comment='svm_models_%s' % hash_obj(train_set),
        cachedir=constants.get_path()['cache'])
    def _train(train_raw):
        # Per-topic variant: one svm is trained and stored per topic.
        models = {}
        all_docs = {}
        for topic in train_raw:
            x_train = []
            y_train = []
            for inst in train_raw[topic]:
                feature_vector = [ext.extract(inst[0], inst[1])
                                  for ext in self.extractors]
                x_train.append(feature_vector)
                y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            # Dump this topic's features for later inspection.
            with open(constants.get_path()['tmp'] +
                      '/ltr-features-%s' % topic, 'wb') as mf:
                json.dump({'x_train': x_train, 'y_train': y_train},
                          mf, indent=2)
            svm.train(x_train, y_train)
            models[topic.lower()] = svm
            all_docs[topic] = [inst[1] for inst in train_raw[topic]]
        return models, all_docs

    self.models, self.all_docs = _train(train_set)
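# Usage sketch for the two train() variants above. The shape of train_set
# and the 'reranker' name are assumptions for illustration: each topic id
# maps to a list of (citance_text, candidate_text, label) triples, matching
# how inst[0], inst[1] and inst[2] are consumed.
#
#   train_set = {'d1418_train': [(cit_text, cand_text, 1.0), ...]}
#   reranker.train(train_set)
#   svm = reranker.model                    # single-model variant
#   svm = reranker.models['d1418_train']    # per-topic variant (keys are
#                                           # lower-cased topic ids)
#
# @object_hashing caches _train's result under a key derived from
# hash_obj(train_set), so repeated runs with identical data load from the
# cache directory instead of retraining.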
def __init__(self, host="localhost", port=9200, index_name="biosum", cred_path=".cred"): self.host = host self.port = port self.index_name = index_name self.cred_path = cred_path # self.doc_type = 'papers' self.es = self.__connect() self.ic = IndicesClient(self.es) try: cache_file = constants.get_path()["cache"] self.page_cache = shelve.open(cache_file + "/pages.p", writeback=False) except: print "Not found: %s" % cache_file print sys.exc_info()[0] sys.exit()
def load_cursor_from_image(self, filename):
    # Build a pygame cursor string-map from an image: black pixels map to
    # 'X' (black in the cursor), red pixels to '.' (white), anything else
    # to ' ' (transparent). The flip/rotate reorient the image before the
    # pixels are sampled row by row.
    i = pygame.image.load(constants.get_path(filename))
    i = pygame.transform.flip(i, True, False)
    i = pygame.transform.rotate(i, 90)
    size = i.get_size()
    cur = []
    for x in xrange(size[0]):
        n = ""
        for y in xrange(size[1]):
            val = i.get_at((x, y))[0:3]
            if val == (0, 0, 0):
                n = n + "X"
            elif val == (255, 0, 0):
                n = n + "."
            else:
                n = n + " "
        cur.append(n)
    return (size, (0, 0)) + pygame.cursors.compile(cur)
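# Usage sketch ('cursor.png' is a hypothetical asset name). The returned
# tuple is (size, hotspot, xormasks, andmasks), which unpacks directly
# into the classic pygame cursor API:
#
#   cursor = self.load_cursor_from_image('cursor.png')
#   pygame.mouse.set_cursor(*cursor)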
def scrape_climate_data():
    # Download the monthly climate pages for station ws-421820 from
    # tutiempo.net and store the raw HTML under assets/climate-data/.
    path = get_path()
    for year in range(2013, 2019):
        for month in range(1, 13):
            # Months are zero-padded in the URL (e.g. '01-2013').
            url = ('https://en.tutiempo.net/climate/'
                   '{:02d}-{}/ws-421820.html'.format(month, year))
            data = requests.get(url)
            data_utf = data.text.encode('utf-8')
            out_dir = '{}/assets/climate-data/{}'.format(path, year)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            with open('{}/{}.html'.format(out_dir, month), 'wb') as result:
                result.write(data_utf)
            sys.stdout.flush()
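# Running the scraper writes one file per month, e.g.
# <path>/assets/climate-data/2013/1.html through 2018/12.html (72 files
# for the 2013-2018 range). A typical entry point:
#
#   if __name__ == '__main__':
#       scrape_climate_data()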
import re
import os
import sys
from functools import wraps
from constants import get_path
from pprint import pformat
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import codecs
import math

stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

STOPWORDS = get_path()['data'] + '/stopwords.txt'
with open(STOPWORDS) as f:
    stopwords = frozenset([l.strip().lower() for l in f])

from optparse import OptionParser
from time import time as now

try:
    import matplotlib.pyplot as plt
except ImportError:
    pass
try:
    import numpy as np
except ImportError:
    pass
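# Sketch of how these module-level helpers combine (outputs shown are
# illustrative). The gaps=True tokenizer splits on runs of non-word
# characters while keeping hyphens and apostrophes inside tokens:
#
#   tokens = tokenizer.tokenize("The patients' well-being improved")
#   # -> ['The', "patients'", 'well-being', 'improved']
#   terms = [stemmer.stem(t.lower())
#            for t in tokens if t.lower() not in stopwords]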
def get_default_theme():
    return load_theme(constants.get_path("default_theme"))
from rerank.null import Reranker as RerankInterface
import json
import codecs
import os
import sys
from copy import deepcopy
from libs.evaluate import merge_offsets
from libs.supervised.prep.prepare import Prep
from constants import get_path, join_path
from libs.supervised.classifiers.svm_rank import Supervised
from util.common import hash_obj
from util.cache import simple_caching, object_hashing
from importlib import import_module
import constants

path = get_path()
STOPWORDS_PATH = path['data'] + '/stopwords.txt'
CLF_PATH = join_path(path['root'], 'libs/supervised/classifiers')
docs_path = join_path(path['data'], 'TAC_2014_BiomedSumm_Training_Data')
json_data_path = join_path(path['data'], 'v1-2a.json')

# root_proj_path = os.getcwd()
# while not('.git' in os.listdir(root_proj_path)):
#     root_proj_path = os.path.split(root_proj_path)[0]
# if not(root_proj_path in sys.path):
#     sys.path.append(root_proj_path)


class Reranker(RerankInterface):

    reranker_opts = {
from random import randint
from copy import deepcopy
import itertools
from log_conf import Logger
from summarizer.mmr_summarizer import MMR
from util.aritmatic_operations import mean_conf
from util.tokenization import WordTokenizer
from util.common import write_json_as_csv, hash_obj, hash_dict
import gzip

w_t = WordTokenizer(stem=False)
logger = Logger(__file__.split('/')[-1]).logger
path = constants.get_path()
result_outpath = 'tmp/tmpres/'
_ANNS_DIR = path['ann']
_ANNS_PATH = path['ann_json']
CACHE = path['cache']
valid_topics = ['all']
# doc_mod = DocumentsModel(_ANNS_DIR)

CACHE_FILE = constants.join_path(CACHE, 'umls.json')
if os.path.isfile(CACHE_FILE):
    try:
        with codecs.open(CACHE_FILE, 'rb', 'utf-8') as mf:
            cachefile = json.load(mf)
def get_data(docs_path=constants.get_path()['ann'],
             json_data_path=constants.get_path()['ann_json']):
    '''
    Populates the docs_new object, which stores information about the
    topics.

    Format of docs_new:
    [ <list>(dict) keys (topic_id): 'd1418_train', ...:
      {'d1418_train': [ <list>(dict) keys (citance_number): u'1', u'2', ...
        {u'11': [ <list>(dict) keys (annotator_id): 'I', 'B', ...
          {u'I': <dict>, keys: 'ref_art', 'not_relevant', 'cit_offset',
                               'cit_art', 'ref_offset', 'cit_text',
                               'ref_text'}

    Args:
        docs_path(str): Path to the training data directory,
            e.g. data/TAC_2014_BiomedSumm_Training_Data
        json_data_path(str): Path to the json training file (v1-2a.json)

    Returns:
        dict with the above format
    '''
    doc_mod = DocumentsModel(docs_path)
    docs = doc_mod.get_all()
    with codecs.open(json_data_path, 'rb', 'utf-8') as mf:
        data = json.load(mf)
    docs_new = {}
    for tid, annotations in data.iteritems():
        if tid not in docs_new:
            docs_new[tid] = {}
        for annotator_id, ann_list in annotations.iteritems():
            for ann in ann_list:
                cit = ann['citance_number']
                if cit not in docs_new[tid]:
                    docs_new[tid][cit] = {}
                # Create the per-annotator dict separately so a second
                # annotator on an existing citance does not raise KeyError.
                if annotator_id not in docs_new[tid][cit]:
                    docs_new[tid][cit][annotator_id] = {}
                if 'ref_offset' not in docs_new[tid][cit][annotator_id]:
                    docs_new[tid][cit][annotator_id]['ref_offset'] = \
                        ann['reference_offset']
                else:
                    docs_new[tid][cit][annotator_id]['ref_offset'] = union(
                        docs_new[tid][cit][annotator_id]['ref_offset'] +
                        ann['reference_offset'])
                if 'cit_offset' not in docs_new[tid][cit][annotator_id]:
                    docs_new[tid][cit][annotator_id]['cit_offset'] = \
                        [ann['citation_offset']]
                else:
                    docs_new[tid][cit][annotator_id]['cit_offset'] = union(
                        docs_new[tid][cit][annotator_id]['cit_offset'] +
                        [ann['citation_offset']])
                docs_new[tid][cit][annotator_id]['ref_art'] = \
                    ann['reference_article']
                docs_new[tid][cit][annotator_id]['cit_art'] = \
                    ann['citing_article']
    # Resolve offsets to text once all annotations are merged.
    for tid in docs_new:
        for cit in docs_new[tid]:
            for ann in docs_new[tid][cit]:
                docs_new[tid][cit][ann]['ref_text'] = \
                    [(s, doc_mod.get_doc(
                        tid,
                        docs_new[tid][cit][ann]['ref_art'].lower(),
                        interval=s))
                     for s in docs_new[tid][cit][ann]['ref_offset']]
                cit_off = union(docs_new[tid][cit][ann]['cit_offset'])
                docs_new[tid][cit][ann]['cit_text'] = \
                    ' '.join([doc_mod.get_doc(
                        tid,
                        docs_new[tid][cit][ann]['cit_art'].lower(),
                        intrvl)
                        for intrvl in cit_off])
    return docs_new
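# Access sketch for the returned structure (the topic, citance and
# annotator ids below are illustrative):
#
#   docs_new = get_data()
#   ann = docs_new['d1418_train'][u'1'][u'I']
#   ann['cit_text']   # citance text joined into a single string
#   ann['ref_text']   # list of (offset, text) pairs from the reference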
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from heapq import heappush, heappushpop
try:
    import cPickle as pickle
except ImportError:
    import pickle
import os
import re
from util.tokenization import WordTokenizer, SentTokenizer
from nltk.stem.porter import PorterStemmer
from util.common import (hash_obj, VerbosePrinter, flatten)
from constants import get_path

cache_DIR = get_path()['cache']


class Summarizer(object):

    '''
    Base class for summarizers
    '''

    method_opts = {}

    def __init__(self, args=None, opts=None):
        """
        Initialize the Summarizer.

        args is a list of arguments for the Summarizer (typically from
        the input to evaluate.py); opts is an ArgumentParser or
        OptionParser object.

        Notes
from rerank.null import Reranker as RerankInterface
import json
import codecs
import os
import sys
from libs.evaluate import merge_offsets
from libs.supervised.prep.prepare import Prep
from constants import get_path, join_path
from libs.supervised.classifiers.svm_rank import Supervised
from util.common import hash_obj
from util.cache import simple_caching, object_hashing
from importlib import import_module
import constants
import operator

path = get_path()
STOPWORDS_PATH = path['data'] + '/stopwords.txt'
CLF_PATH = join_path(path['root'], 'libs/supervised/classifiers')
docs_path = join_path(path['data'], 'TAC_2014_BiomedSumm_Training_Data')
json_data_path = join_path(path['data'], 'v1-2a.json')

# root_proj_path = os.getcwd()
# while not('.git' in os.listdir(root_proj_path)):
#     root_proj_path = os.path.split(root_proj_path)[0]
# if not(root_proj_path in sys.path):
#     sys.path.append(root_proj_path)


class Reranker(RerankInterface):