Esempio n. 1
0
    def __init__(self, start_train, end_train, start_test, end_test):
        """Create an instance with empty data containers and the given bounds.

        start_train, end_train, start_test and end_test are stored unchanged
        on the instance under the same names.
        """
        # Project-level helper objects; constructed first, in a fixed order.
        self.sentences = vpe.AllSentences()
        self.annotations = vpe.Annotations()
        self.file_names = Files()
        self.all_auxiliaries = vpe.Auxiliaries()
        self.gold_standard_auxs = vpe.Auxiliaries()

        # Range bounds, kept exactly as supplied by the caller.
        self.start_train, self.end_train = start_train, end_train
        self.start_test, self.end_test = start_test, end_test

        # Placeholders that stay None until something assigns them.
        self.hyperplane = self.m = self.m2 = None
        self.pre_oversample_length = 0
        self.features = []

        """ Train and test_vectors are lists of csr_matrices in order to save memory. """
        self.train_vectors, self.train_classes = [], []
        self.test_vectors, self.test_classes = [], []

        # Output containers, empty at construction time.
        self.predictions, self.result_vector = [], []
Esempio n. 2
0
from numpy import dot
from copy import copy, deepcopy
from random import shuffle

# Lemma lists, one per auxiliary category.
MODALS = [
    'can',
    'could',
    'may',
    'must',
    'might',
    'will',
    'would',
    'shall',
    'should',
]
BE, HAVE, DO, TO = ['be'], ['have'], ['do'], ['to']
SO = ['so', 'same', 'likewise', 'opposite']

# The categories as a list of lists, and the same lemmas flattened into one
# list in category order.
ALL_CATEGORIES = [MODALS, BE, HAVE, DO, TO, SO]
AUX_LEMMAS = sum(ALL_CATEGORIES, [])

# Loaded once at import time from the file named by Files.UNIQUE_AUXILIARIES_FILE.
ALL_AUXILIARIES = Files().extract_data_from_file(Files.UNIQUE_AUXILIARIES_FILE)

# NOTE(review): presumably the label used when a dependency is absent - confirm.
EMPTY_DEP = 'NONE'
""" ---- Exception classes. ---- """


class AuxiliaryHasNoTypeException(BaseException):
    """Signals that an auxiliary has no category.

    Constructing the exception prints a one-line notice naming the auxiliary.
    The name is also available afterwards as ``aux_name`` and as ``args[0]``.

    NOTE(review): this derives from BaseException, so ``except Exception``
    handlers do not catch it. The base class is left unchanged here so that
    existing handlers keep catching exactly what they caught before.
    """

    def __init__(self, aux_name):
        """Record ``aux_name`` and print the notice.

        aux_name: name of the auxiliary; it is interpolated with ``%s``.
        """
        # Explicit two-argument super() works on both Python 2 and Python 3.
        super(AuxiliaryHasNoTypeException, self).__init__(aux_name)
        self.aux_name = aux_name
        # A single parenthesised argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print('The following auxiliary, %s, has no category!' % aux_name)


class EmptySentDictException(BaseException):
    """Exception type that carries no data of its own."""

    def __init__(self):
        """Accept no arguments and store nothing."""


class GoldStandardComesFromRawException(BaseException):
Esempio n. 3
0
import word_characteristics as wc
import numpy as np
import nltktree as nt
import warnings
from file_names import Files
from os import listdir
from sys import argv
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix, vstack

# Module-wide Files instance, created once at import time.
files = Files()
# File name / relative path of the saved .npy feature data.
MRG_DATA_FILE = 'dataset_with_features_ALL_AUXS.npy'
AUTO_PARSE_FILE = '../npy_data/auto_parse_with_features_FULL_DATASET.npy'
# NOTE(review): absolute path under one user's home directory; it will not
# exist on other machines - confirm before running elsewhere.
AUTO_PARSE_XML_DIR = '/Users/kian/Documents/HONOR/xml_annotations/raw_auto_parse/'


class Dataset(object):
    def __init__(self):
        self.sentences = []
        self.auxs = []
        self.gold_auxs = []
        self.X = []
        self.Y = []
        self.section_ends = {k: -1 for k in range(0, 25)}

    def add(self, section):