Esempio n. 1
0
 def train_network(self):
     """
     Build the neural network model selected by the instance configuration.

     The corpora are only read when ``self.retrain`` is set; otherwise the
     model constructors receive ``None`` for both document collections.

     :return: a ``global_norm_nn.GlobalNormNN`` when ``self.global_norm`` is
         truthy, otherwise a ``classification.NNActions``
     :raises Exception: if ``self.train_path`` is empty
     """
     log = logging.getLogger('progress_logger')
     log.info("Training neural network")

     # Nothing can be built without a training corpus location.
     if not self.train_path:
         raise Exception("No path to training corpus provided")

     train_documents = validation_documents = None
     if self.retrain:
         log.info("Reading documents")
         train_documents = data.read_all(self.train_path)
         validation_documents = data.read_all(self.validation)
     log.info("Started training")

     if not self.global_norm:
         return classification.NNActions(
             train_documents,
             validation_documents,
             self.retrain,
             self.model_name)

     # Only the globally normalised variant takes the pretrained base.
     return global_norm_nn.GlobalNormNN(
         train_documents,
         validation_documents,
         self.retrain,
         self.model_name,
         pretrained_base=self.pretrained_base)
def complete_base():
    """
    Build a BaseProcedure from the module-level settings, then run
    prediction and evaluation on the corpus at ``test_path``.
    """
    # NOTE(review): the result of this read is discarded; it is kept because
    # read_all may have side effects that are not visible from here.
    data.read_all(test_path)

    settings = dict(
        train_path=train_path,
        token_window=token_window,
        retrain_rel=retrain_REL,
        retrain_dct=retrain_DCT,
        doc_time_path=DCT_model_name,
        rel_classifier_path=relation_model_name,
        greedy=greedy,
        transitive=transitive,
        linear=linear,
    )
    procedure = BaseProcedure(**settings)

    # Where the magic happens
    procedure.predict(test_path)
    procedure.evaluate(test_path)
Esempio n. 3
0
    def __init__(self,
                 train_path="",
                 validation_path="",
                 retrain_rel=False,
                 retrain_dct=False,
                 rel_classifier_path="",
                 doc_time_path="",
                 token_window=30,
                 greedy=False,
                 transitive=False,
                 linear=True):
        """
        Create the annotator, obtain both classifiers (trained or loaded) and
        optionally evaluate them on a validation corpus.

        :param train_path: Path to training corpus (not required if models don't need to be retrained)
        :param validation_path: Path to a validation corpus; when non-empty, both models are evaluated
            on it and each report is printed and written to ``<model name>_eval.txt`` under ``utils.model_path``
        :param retrain_rel: TRUE: train the relation classifier with ``train_rel_classifier``.
            FALSE: load it with ``utils.load_model``
        :param retrain_dct: TRUE: train the doctime classifier with ``train_doctime``.
            FALSE: load it with ``utils.load_model``
        :param rel_classifier_path: Path to Binary relation classification (YES/NO)
        :param doc_time_path: Path to Doctime classifier
        :param token_window: Window in which candidates need to be generated
        :param greedy: TRUE: use greedy decision making on binary classifications. FALSE: use ILP inference
        :param transitive: Forwarded to the inference annotator and to ``data.read_all``
        :param linear: Forwarded to ``train_doctime`` when the doctime classifier is retrained
        """
        self.train_path = train_path
        self.token_window = token_window
        self.transitive = transitive
        self.greedy = greedy
        self.relpath = rel_classifier_path
        self.doctimepath = doc_time_path

        self.annotator = (
            GreedyAnnotator(token_window=token_window)
            if greedy else
            InferenceAnnotator(token_window=token_window,
                               transitive=transitive))

        # Doctime classifier first, relation classifier second.
        if not retrain_dct:
            self.doc_time_model = utils.load_model(doc_time_path)
        else:
            self.doc_time_model = self.train_doctime(doc_time_path, linear)

        if not retrain_rel:
            self.annotator.model = utils.load_model(rel_classifier_path)
        else:
            self.annotator.model = self.train_rel_classifier(
                rel_classifier_path)

        # evaluation
        if not validation_path:
            return

        docs = data.read_all(validation_path, transitive=transitive)
        dct_report = os.path.join(utils.model_path,
                                  doc_time_path + "_eval.txt")
        rel_report = os.path.join(utils.model_path,
                                  rel_classifier_path + "_eval.txt")
        reports = ((dct_report, self.doc_time_model),
                   (rel_report, self.annotator.model))
        for report_path, model in reports:
            # The report file is opened before the model is evaluated.
            with open(report_path, 'w+') as report:
                eval_str = model.evaluate(docs)
                print(eval_str)
                report.write(eval_str)
Esempio n. 4
0
 def train_rel_classifier(self, save_path, validation="", max_documents=50):
     """
     Train the relation classifier on the training corpus and save it.

     :param save_path: Name/path handed to ``utils.save_model`` for the trained model
     :param validation: Unused by this method; kept so existing callers keep working
     :param max_documents: Upper bound on the number of training documents used.
         Defaults to 50, the cap that was previously hard-coded; pass ``None``
         to train on every document that was read
     :return: the trained model
     :raises Exception: if ``self.train_path`` is empty
     """
     logger = logging.getLogger('progress_logger')
     logger.info("Training relation classifier")
     if not self.train_path:
         raise Exception("No path to training corpus provided")

     logger.info("Reading documents")
     train_documents = data.read_all(self.train_path,
                                     transitive=self.transitive)
     # Slicing with None keeps the whole corpus, so one expression covers
     # both the capped and the uncapped case.
     train_documents = train_documents[:max_documents]
     logger.info("Started training")
     model = classification.train_relation_classifier(
         train_documents, self.token_window)
     utils.save_model(model, save_path)
     return model
Esempio n. 5
0
 def train_doctime(self, save_path, linear):
     """
     Train the doctime classifier on the training corpus and save it.

     :param save_path: Passed to ``utils.save_model`` as ``name``
     :param linear: Forwarded to ``classification.train_doctime_classifier``
     :return: the trained model
     :raises Exception: if ``self.train_path`` is empty
     """
     log = logging.getLogger('progress_logger')
     log.info("Training doctime classifier")

     # Bail out early when there is no corpus to train on.
     if not self.train_path:
         raise Exception("No path to training corpus provided")

     log.info("Reading documents")
     corpus = data.read_all(self.train_path, transitive=self.transitive)
     log.info("Started training")
     trained = classification.train_doctime_classifier(corpus, linear=linear)
     utils.save_model(trained, name=save_path)
     return trained
Esempio n. 6
0
# Script body: resolve the data location, log the parsed arguments, load the
# graph and print what data.in_depth yields between the two requested nodes.
data_path = os.path.abspath(
    ARGS.path
)  # convert to an absolute path; os.path.join() should be used when assembling a relative path.

print('DATA PATH: ' + str(data_path))

# '{0.first}' reads the attribute `first` of the first positional argument,
# so ARGS can be passed whole instead of field by field.
logging.debug('First: {0.first} Last: {0.last}'.format(
    ARGS))  # additional string-formatting capabilities.
logging.debug('Data path: {0.path} ({1})'.format(ARGS, data_path))
logging.debug('Nodes: {0.nodes} Links: {0.links}'.format(ARGS))
# One message at each remaining severity level.
logging.info('Test')
logging.warning('Test')
logging.error('Test')
logging.critical('Test')

# read_all's result is unpacked into the graph and a name lookup table.
Graph, Names = data.read_all(data_path, ARGS.nodes, ARGS.links)

# Translate the names given on the command line into their 'number' entries.
First = Names[ARGS.first]['number']
Last = Names[ARGS.last]['number']

# Presumably each item is a path from First to Last - confirm in data.in_depth.
for path in data.in_depth(Graph, First, Last):
    print(path)
# The bare string below is disabled sample code (reading CSV data inside a
# context manager); it is a plain expression statement and is never executed.
"""
# Чтение данных в формате csv из менеджера контекста
with open(nodes_path, "rt", encoding="utf-8") as src:
        rdr = csv.reader(src)
        for number, name in rdr: #Можно использовать data вместо number, name чтобы получить последовательность списков.
            print(number, name)
"""

# Assign the result of opening the file to a variable.
Esempio n. 7
0
from __future__ import division
import math
import data
import oracle
import utils
"""
Script for testing attributes of the datasets
"""

# Both corpora are read at module level, so importing or running this module
# triggers the reads; utils.dev and utils.train supply the locations.
dev = data.read_all(utils.dev)
train = data.read_all(utils.train)


def treeless(documents):
    """
    Print how far the relation structure of the documents is from a tree.

    The keys of each document's ``relation_mapping`` are unpacked as
    (parent, child) pairs. Three values are printed, in this order:

    * share of relations whose parent already occurred in the same document
      (parents with several children),
    * share of relations whose child already occurred in the same document
      (children with several parents),
    * total number of relations.

    When no relations are found, both shares are reported as 0.0 instead of
    raising ZeroDivisionError.

    :param documents: iterable of objects exposing ``relation_mapping``
    """
    total = 0
    repeated_children = 0
    repeated_parents = 0
    for document in documents:
        pairs = list(document.relation_mapping)
        parents = [parent for (parent, _) in pairs]
        children = [child for (_, child) in pairs]
        # how many relations in total are there?
        total += len(pairs)
        # how many children are not unique = how many children have multiple parents
        repeated_children += len(children) - len(set(children))
        # how many parents are not unique = how many parents have multiple children
        repeated_parents += len(parents) - len(set(parents))

    if total == 0:
        # Empty corpus or documents without relations: nothing to divide by.
        print(0.0, 0.0, total)
        return

    # float() keeps the ratios fractional regardless of the division mode.
    denominator = float(total)
    print(repeated_parents / denominator, repeated_children / denominator,
          total)


def samepar_relations(documents):
Esempio n. 8
0
                x = indices.index(ind)
                del indices[x]
                del distribution[x]
        return None


def get_training_sequence(entities, arcs, doc):
    """
    Yield the (configuration, action) steps that lead from the initial
    configuration to the terminal one.

    Used to determine the training sequence of a document. Each yielded pair
    holds a copy of the configuration as it was before the action, and the
    name of the configuration method chosen by the oracle. The action is
    applied to the live configuration only after the consumer resumes the
    generator.
    """
    state = Configuration(entities, doc)
    guide = KnowingOracle(arcs)

    while True:
        if state.empty_buffer():
            break
        action_name = guide.next_step(state)
        # Round-trip through cPickle so the yielded snapshot is a separate
        # object from the configuration that keeps being mutated below.
        snapshot = cPickle.loads(cPickle.dumps(state, -1))
        yield (snapshot, action_name)
        # applies function to configuration
        transition = getattr(state, action_name)
        transition()


if __name__ == '__main__':
    # Test methods: for every development document, compare the number of
    # relations with the number of arc actions in its training sequence.
    for document in read_all(utils.dev, transitive=False):
        steps = get_training_sequence(document.get_entities(),
                                      document.get_relations(), document)
        relation_total = len(document.get_relations())
        arc_total = sum(1 for _, action in steps
                        if action in ("left_arc", "right_arc"))
        # Should print equal amounts
        print(relation_total, arc_total)