Example #1
def fit_on_data(dir_data=DIR_DATA, name_file=NAME_FILE):
    '''
    fit the model on the given annotated file
    '''
    corpus = RepoModel(dir_data)  # load corpus
    doc = corpus.documents[name_file]  # get document with key
    bc = BertClient(ip='127.0.0.1',
                    port=8701,
                    port_out=8702,
                    show_server_config=True)  # bert model as service

    words, wordsvec, spans, wordslabel = words_vec_label(doc, bc)

    # wordsvec from list to array
    wordsvec = np.asarray(wordsvec)

    # label encoder
    wordslabel = [label[0] for label in wordslabel]
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(wordslabel)
    Y_encoder = encoder.transform(wordslabel)
    # convert integers to dummy variables (i.e. one hot encoded)
    Y_encoder = np_utils.to_categorical(Y_encoder)

    #X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)
    X_train, X_test, Y_train, Y_test = wordsvec, wordsvec, Y_encoder, Y_encoder  # no held-out split: evaluation data equals training data

    # model define
    N_batch = 4
    N_epoch = 4
    en_verbose = 1
    input_dim = wordsvec.shape[1]
    N_classes = len(set(wordslabel))

    model = create_base_network(X_train[0].shape[0],
                                len(np.unique(wordslabel)))
    model.summary()

    # model training
    start = time.time()
    history = model.fit(X_train,
                        Y_train,
                        batch_size=N_batch,
                        epochs=N_epoch,
                        verbose=en_verbose,
                        validation_data=(X_test, Y_test))
    end = time.time()
    print('training time elapsed:\t', end - start, 'sec')
    return model
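
A minimal usage sketch (not part of the original snippet), assuming the bert-as-service instance above is running on ports 8701/8702 and that fit_on_data and its defaults are in scope; the token strings are purely illustrative:

import numpy as np
from bert_serving.client import BertClient

model = fit_on_data()  # trains on the DIR_DATA / NAME_FILE defaults

bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702)
new_vecs = bc.encode(['ship', 'missile', 'harbour'])  # hypothetical tokens
probs = model.predict(np.asarray(new_vecs))
print(probs.argmax(axis=1))  # class indices; map back with the LabelEncoder fitted during training
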
Example #2
def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys={}):
    '''
    data_dir -> path to brat annotation data; searched recursively
    model_dir -> path to save the trained spaCy model
    exclude_normalize_tags -> list of tags to exclude from normalization. If None, no normalization is performed.
    keys -> dict translating brat tags to training tags; tags not in the dict are preserved
    '''

    r = RepoModel(data_dir, recursive=True, cached=False)

    nlp = spacy.load('en_default')

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  #we have manually tagged all instances of these

    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))

    # print(normalized_train_data)

    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(
        u"Hi Adam,\nSounds great to me. I'll send through the QA department. In the invite you through Skype, and we can discuss if Applause is right for you.\nI look forward to it!\nRegards,\nAndrew"
    )
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)

    if model_dir is not None:
        save_model(nlp, model_dir)
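
A hedged sketch of how this entry point might be called; the paths and tag mapping below are invented for illustration and are not part of the original example:

if __name__ == '__main__':
    main('data/brat_annotations',            # hypothetical brat corpus directory
         model_dir='models/ner',             # hypothetical output directory
         exclude_normalize_tags=['Person'],  # skip normalization for this tag
         keys={'Person': 'PERSON', 'Org': 'ORG'})
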
Example #3
def convert(brat_dir_path: str, output_dir_path: str):
    # load the brat repository
    repo = RepoModel(brat_dir_path)
    print('Loaded {} document(s) from {}'.format(len(repo.documents),
                                                 brat_dir_path))

    for document_name in repo.documents:
        document = repo.documents[document_name]

        converter = DocConverter(document)
        sentences = converter.sentences

        with open(
                os.path.join(output_dir_path, '{}.json'.format(document_name)),
                'x') as output_file:
            json.dump(list(map(lambda s: s.to_dict(), sentences)),
                      output_file,
                      indent=2)
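
A short, hedged usage sketch; the directory names are illustrative only. The output directory must already exist, since the function opens files in 'x' mode:

import os

os.makedirs('out/json', exist_ok=True)    # hypothetical output directory
convert('data/brat_corpus', 'out/json')   # hypothetical brat corpus path
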
Example #4
def convert(brat_dir_path: str, output_file_path: str, verbose: bool = False):
    # load the brat repository
    repo = RepoModel(brat_dir_path)
    if verbose:
        print('Loaded {} document(s) from {}'.format(len(repo.documents),
                                                     brat_dir_path))

    # load the SLING commons store and the document schema
    commons = load_commons_store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    writer = sling.RecordWriter(output_file_path)
    for document_name in repo.documents:
        document = repo.documents[document_name]
        reader = DocReader(document)
        converter = DocConverter(commons, schema, document_name)
        converter.convert(reader, writer)

    writer.close()
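
As with the JSON variant above, a hedged invocation sketch with illustrative paths; the output is a single SLING record file:

convert('data/brat_corpus', 'out/documents.rec', verbose=True)  # hypothetical paths
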
Example #5
        # print("sent:", sent)
        for token in word_tokenize(sent):
            # print("token:", token)
            ##            if len(token) > 1 and token not in contractions and not re.search("^[\W\d]+$", token) and token not in set(methods+strings+comments+operands+operators+variables+URLs):
            if not re.search("^[\W\d]+$", token) and token not in set(
                    methods + strings + comments + operands + operators +
                    variables + URLs):
                mywords.append(token)

    alltokens = set(methods + strings + comments + operands + operators +
                    mywords + variables + URLs)
    post["words"] = pythontagger.tag(list(alltokens))

# --------

r = RepoModel("../annotations")  # load repomodel
r.documents  # all documents in your brat corpus

filename = "9"
doc = r.documents[filename]  # get document by key
# print(doc.sentences)    			# a list of sentences in document
# print(doc.annotations)  			# the annotation objects in a document

# for word in doc.annotations:
#     print(word.repr, word.labels)

for filename, post in posts.items():
    doc = r.documents[filename]
    words = []
    for word in doc.annotations:
        words.append((word.repr, word.labels))
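
A small, hedged sketch of inspecting the collected pairs; it assumes the bratreader annotation objects used above, where labels behaves like a mapping from label name to values:

for surface, labels in words:
    print(surface, list(labels))  # annotation text and its label names
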
Example #6
        ANN_FILEs.append(file_name[:-4])

DIR_MODEL = './save/'
file_model_trig = DIR_MODEL + TASK_NAME + '_model_trigger.pkl'
file_model_arg = DIR_MODEL + TASK_NAME + '_model_arg.pkl'
bc = BertClient(ip='127.0.0.1',
                port=8701,
                port_out=8702,
                show_server_config=False)  # bert model as service
triggers, vec_trig, label_trig, args, vec_arg, label_arg = [], [], [], [], [], []
try:
    triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(
        NAME_DATA_FILE)
    args, vec_arg, label_arg = None, None, None
except:
    corpus = RepoModel(DIR_DATA)  # load corpus
    for ANN_FILE in ANN_FILEs:
        doc = corpus.documents[ANN_FILE]  # get document with key
        ttriggers, tvec_trig, tlabel_trig, targs, tvec_arg, tlabel_arg, tlabel_arg_for_each_trig = get_events_in_mention(
            doc, bc)
        triggers.extend(ttriggers)
        vec_trig.extend(tvec_trig)
        label_trig.extend(tlabel_trig)
        args.extend(targs)
        vec_arg.extend(tvec_arg)
        label_arg.extend(tlabel_arg)

    print('trigs:', len(vec_trig), 'args:', len(vec_arg))
    joblib.dump([triggers, vec_trig, label_trig, args, vec_arg, label_arg],
                NAME_DATA_FILE)
    args, vec_arg, label_arg = None, None, None
Example #7
def corpus_save(dir_data, out_data):
    corpus = RepoModel(dir_data)
    corpus.save_xml(out_data)
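
A one-line, hedged usage sketch with illustrative paths:

corpus_save('data/brat_annotations', 'data/xml_export')  # hypothetical input/output paths
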
Example #8
def training(DIR_DATA):
    print('\ndata importing:')
    TASK_NAME = DIR_DATA
    NAME_DATA_FILE = TASK_NAME + '_data_import' + '.save'

    # obtain all the files list
    ANN_FILEs = []
    DIR_ALL_FILES = os.listdir(DIR_DATA)
    for file_name in DIR_ALL_FILES:
        if file_name.split('.')[-1] == 'txt':
            ANN_FILEs.append(file_name[:-4])

    DIR_MODEL = './save_Eng/'
    file_model_trig = DIR_MODEL + TASK_NAME + '_model_trigger.pkl'
    file_model_arg = DIR_MODEL + TASK_NAME + '_model_arg.pkl'
    triggers, vec_trig, label_trig, args, vec_arg, label_arg = [], [], [], [], [], []
    try:
        triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(
            NAME_DATA_FILE)
        args, vec_arg, label_arg = None, None, None
    except:
        corpus = RepoModel(DIR_DATA)  # load corpus
        for ANN_FILE in ANN_FILEs:
            bc = BertClient(ip='127.0.0.1',
                            port=8701,
                            port_out=8702,
                            show_server_config=False)  # bert model as service
            doc = corpus.documents[ANN_FILE]  # get document with key
            ttriggers, tvec_trig, tlabel_trig, targs, tvec_arg, tlabel_arg, tlabel_arg_for_each_trig = get_events_in_mention(
                doc, bc)
            triggers.extend(ttriggers)
            vec_trig.extend(tvec_trig)
            label_trig.extend(tlabel_trig)
            args.extend(targs)
            vec_arg.extend(tvec_arg)
            label_arg.extend(tlabel_arg)

        print('trigs:', len(vec_trig), 'args:', len(vec_arg))
        joblib.dump([triggers, vec_trig, label_trig, args, vec_arg, label_arg],
                    NAME_DATA_FILE)
        args, vec_arg, label_arg = None, None, None

    print('=' * 65, '\n>>trigger model training:')
    try:
        model_trig, encoder_trig = joblib.load(file_model_trig)
        acc_pre = test_on_data(model_trig,
                               encoder_trig,
                               vec_trig,
                               label_trig,
                               en_verbose=0)
    except:
        # model define
        input_dim = np.asarray(vec_trig).shape[1]
        N_classes = len(set(label_trig))
        model_trig = create_base_network(input_dim, N_classes)
        encoder_trig = LabelEncoder()
        encoder_trig.fit(label_trig)
        acc_pre = 0

    N_batchs = [
        len(label_trig), 8192, 4096, 2048, 1024, 512, 32, 16, 8, 4, 2, 1
    ]
    lrs = [0.001, 0.00001]
    for N_batch in N_batchs:
        for lr in lrs:
            Times_training, N_batch, N_epoch, en_verbose = 3, N_batch, max(
                16, int(np.floor(np.sqrt(10 * N_batch)))), 1
            for times in range(1, Times_training):
                the_lr = lr / times
                model_trig, encoder_trig, his = fit_on_data(
                    vec_trig,
                    label_trig,
                    model_trig,
                    encoder_trig,
                    the_lr,
                    N_batch=N_batch,
                    N_epoch=N_epoch,
                    en_verbose=en_verbose)
                print('acc:{}'.format(his.history['acc'][-1]))
                val_acc = test_on_data(model_trig,
                                       encoder_trig,
                                       vec_trig,
                                       label_trig,
                                       en_verbose=en_verbose)
                joblib.dump([model_trig, encoder_trig],
                            '{}_{:.5f}_{:.5f}_{:.5f}_{:.5f}.pkl'.format(
                                file_model_trig[0:-4], his.history['acc'][-1],
                                val_acc, the_lr,
                                N_batch))  # save the model to disk
                if val_acc > acc_pre:
                    acc_pre = val_acc
                    joblib.dump([model_trig, encoder_trig], '{}.pkl'.format(
                        file_model_trig[0:-4]))  # save the model to disk
                else:
                    break
    return  # NOTE: this early return skips the argument-model training code below

    print('=' * 65, '\n>>argument model training:')
    try:
        triggers, vec_trig, label_trig = None, None, None
        triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(
            NAME_DATA_FILE)
        triggers, vec_trig, label_trig = None, None, None
        model_arg, encoder_arg = joblib.load(file_model_arg)
        acc_pre = test_on_data(model_arg,
                               encoder_arg,
                               vec_arg,
                               label_arg,
                               en_verbose=0)
    except:
        encoder_arg = LabelEncoder()
        encoder_arg.fit(label_arg)
        # model define
        input_dim = np.asarray(vec_arg).shape[1]
        N_classes = len(set(label_arg))
        model_arg = create_base_network(input_dim, N_classes)
        acc_pre = 0

    for lr in lrs:
        for N_batch in N_batchs:
            Times_training, N_batch, N_epoch, en_verbose = 3, N_batch, max(
                16, int(np.floor(np.sqrt(10 * N_batch)))), 1
            for times in range(1, Times_training):
                the_lr = lr / times
                model_arg, encoder_arg, his = fit_on_data(
                    vec_arg,
                    label_arg,
                    model_arg,
                    encoder_arg,
                    the_lr,
                    N_batch=N_batch,
                    N_epoch=N_epoch,
                    en_verbose=en_verbose)
                print('acc:{}'.format(his.history['acc'][-1]))
                val_acc = test_on_data(model_arg,
                                       encoder_arg,
                                       vec_arg,
                                       label_arg,
                                       en_verbose=en_verbose)
                joblib.dump([model_arg, encoder_arg],
                            '{}_{:.5f}_{:.5f}_{:.5f}_{:.5f}.pkl'.format(
                                file_model_arg[0:-4], his.history['acc'][-1],
                                val_acc, the_lr,
                                N_batch))  # save the model to disk
                if val_acc > acc_pre:
                    acc_pre = val_acc
                    joblib.dump([model_arg, encoder_arg], '{}.pkl'.format(
                        file_model_arg[0:-4]))  # save the model to disk
                else:
                    break
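
A hedged sketch of driving this training routine; the corpus path is illustrative and assumes the directory layout the function expects (one .txt/.ann pair per brat document):

if __name__ == '__main__':
    training('data/event_corpus/')  # hypothetical brat corpus directory
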
Example #9
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            (filepath, tempfilename) = os.path.split(txt_file_path)
            (filename, extension) = os.path.splitext(tempfilename)
            r = RepoModel(filepath)
            r.save_xml(filepath)
            # xml_save(filepath, filename, filename)
            xml_file_path = os.path.join(filepath, filename+'.xml')
            # print("xml_file_path::::", r, file=sys.stderr)
            # if xml_file_path:
            #     pass
            # else:
            #     xml_save(filepath, filename, filename)
            with open(xml_file_path, 'r') as xml_file:
                xml = xml_file.read()
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            j_dic['xml'] = xml

        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text


    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option, '
                         'reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option, '
                         'reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
Example #10
from pathlib import Path
import nltk
from data_collector import BratDataCollector
from bratreader.repomodel import RepoModel
from classification import Classification
from deeppavlov import build_model, configs
import operator

# will this path to the brat repository become a field in the UI?
brat_folder = Path('D:\\Диплом\\prog\\essays\\original')
# brat_folder = Path('D:\\Диплом\\prog\\essays\\russian')

brat_reader = RepoModel(brat_folder)

collector = BratDataCollector(brat_reader)
data = collector.collect_data()


def get_sentiment_statistic(data, language, deeppavlov_model):
    from data_manager import DataManager
    correct_labels = ['Premise', 'Claim']
    ru_sentiment = ['positive', 'neutral', 'negative']
    en_sentiment = ['Positive', 'Neutral', 'Negative']
    cur_sentiment = []
    if language == 'ru':
        cur_sentiment = ru_sentiment
    if language == 'en':
        cur_sentiment = en_sentiment

    args = DataManager().filter_labels(data, correct_labels)
    all_premises = 0
import classifier as classifier
import nltk
import bratreader
import Splitter
import POSTagger

from nltk.corpus import movie_reviews, LazyCorpusLoader, CategorizedPlaintextCorpusReader

from DictionaryTagger import DictionaryTagger
from bratreader.repomodel import RepoModel

reader = RepoModel("bratessays")  # load repomodel
reader.documents

doc = reader.documents["essay01"]  # get document with key 001
print(doc.sentences)  # a list of sentences in document
print(doc.annotations)  # the annotation objects in a document

text = """It is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers. However, when we discuss the issue of competition or cooperation, what we are concerned about is not the whole society, but the development of an individual's whole life. From this point of view, I firmly believe that we should attach more importance to cooperation during primary education.
First of all, through cooperation, children can learn about interpersonal skills which are significant in the future life of all students. What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others. During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred. All of these skills help them to get on well with other people and will benefit them for the whole life.
On the other hand, the significance of competition is that how to become more excellence to gain the victory. Hence it is always said that competition makes the society more effective. However, when we consider about the question that how to win the game, we always find that we need the cooperation. The greater our goal is, the more competition we need. Take Olympic games which is a form of competition for instance, it is hard to imagine how an athlete could win the game without the training of his or her coach, and the help of other professional staffs such as the people who take care of his diet, and those who are in charge of the medical care. The winner is the athlete but the success belongs to the whole team. Therefore without the cooperation, there would be no victory of competition.
Consequently, no matter from the view of individual development or the relationship between competition and cooperation we can receive the same conclusion that a more cooperative attitudes towards life is more profitable in one's success."""
"""
splitter = Splitter.Splitter()
postagger = POSTagger.POSTagger()

splitted_sentences = splitter.split(text)

print(splitted_sentences)

pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
class DataProcess(object):

    reader = RepoModel(bratessayFolder)  # load repomodel

    def __init__(self):
        pass

    def ProcessData(self):
        for i in range(1, 80):
            doc = reader.documents["essay" + str(i)]
            dataObjectList = []
            annotatedData = set(doc.annotations)
            for annotation in annotatedData:
                # print("annotation :", annotation.repr)
                # print("labels :", annotation.labels.items())
                # print("links :", annotation.links)
                # print("********************************************************************************")

                dataObject = {"annotation": annotation.repr,
                              "labels": annotation.labels.items(),
                              "links": annotation.links}

                dataObjectList.append(dataObject)

            data = self.ExtractDataFeatures(dataObjectList, doc.key)

        return data



    def ExtractDataFeatures(self,data,key):

        full = {}
        wholeobject = []
        labs = []
        links = []
        for items in data:
            sentence = items['annotation']
            full['sentence'] = sentence
            for label in items['labels']:
                for lab in label:
                    if lab != None and lab != []:
                        labs.append(lab)
            full['label'] = labs
            labs = []
            for link in items['links'].items():  # .iteritems() is Python 2 only
                lnk = link[0]
                for x in link[1]:
                    linkk = {lnk: x.repr}
                    links.append(linkk)
                full['links'] = links
                links = []

            wholeobject.append(full)
            full = {}

        completeset.append({key:wholeobject})
        return completeset



    def ClassifyArguments(self,dataset):
        ArgumentComponent = []
        for essay in dataset:
            for item in essay.values():
                for x in item:
                    for label in x['label']:
                        if label == 'Claim' or label == 'MajorClaim' or label == 'Premise':
                            filteredObj = (label, x['sentence'])
                            ArgumentComponent.append(filteredObj)


        return ArgumentComponent



    def ClassifyLinks(self,data):
        Links = []
        for essay in data:
            for item in essay.values():
                for x in item:
                    for link in x:
                        if link == 'links':
                            for stance in  x['links']:
                                for y in stance.items():
                                    filteredObj = (y)
                                    Links.append(filteredObj)

        return Links



    def getFilteredWords(self,Components,links = None):

        # provide supporting and attacking arguments, with links if required
        sentences = []
        for (sentiment,words) in Components:
            words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
            sentences.append((words_filtered,sentiment))

        return sentences


    def get_words_in_doc(self,sentences):
        all_words = []
        for (words, sentiment) in sentences:
            all_words.extend(words)
        return all_words


    def get_word_features(self,wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

#--------------------------------------------Test data extraction --------------------------------------#

    def getTestData(self,key):

        doc = reader.documents[key]
        dataObjectList = []
        annotatedData = set(doc.annotations)
        for annotation in annotatedData:
                # print("annotation :", annotation.repr)
                # print("labels :", annotation.labels.items())
                # print("links :", annotation.links)
                # print("********************************************************************************")

            dataObject = {"annotation": annotation.repr,
                              "labels": annotation.labels.items(),
                              "links": annotation.links}

            dataObjectList.append(dataObject)

        data = self.ExtractTestSentences(dataObjectList)

        return data


    def ExtractTestSentences(self,data):

        sentences = []
        for items in data:
            sentence = items['annotation']
            sentences.append(sentence)

        return sentences


    def getTestAccuracyData(self):
        classification = Classification.Classification()

        for i in range(80, 90):
            doc = reader.documents["essay" + str(i)]
            dataObjectList = []
            annotatedData = set(doc.annotations)
            for annotation in annotatedData:

                dataObject = {"annotation": annotation.repr,
                              "labels": annotation.labels.items(),
                              "links": annotation.links}

                dataObjectList.append(dataObject)

            data = self.ExtractDataFeatures(dataObjectList, doc.key)

        preTrainingData = classification.prepareTrainingData(data)  # arguments and links
        Arguments = preTrainingData[0]
        Links = preTrainingData[1]

        Arg_word_features = classification.getWordFeatures(Arguments)
        Link_word_features = classification.getWordFeatures(Links)

        classification.setWordfeatureSet(Arg_word_features)
        ArgumentTesting_set = nltk.classify.apply_features(classification.extract_features, Arguments)

        classification.setWordfeatureSet(Link_word_features)
        LinksTesting_set = nltk.classify.apply_features(classification.extract_features, Links)

        return [ArgumentTesting_set,LinksTesting_set]

#------------------------------------------utilities -------------------------------------#

    def getPathToFile(self,RelativePath):
        dir = os.getcwd()
        ROOT_DIR = os.path.dirname(os.path.abspath(dir))
        folder = os.path.join(ROOT_DIR, RelativePath)
        return folder
import nltk
import os
from bratreader.repomodel import RepoModel
from Utils import Utils
import Classification

bratessayFolder = Utils().getPathToFile('bratessays')

reader = RepoModel(bratessayFolder)

#doc = reader.documents["essay01"]		# get document with key 001
#print("sentences",doc.sentences)    			# a list of sentences in document
#print("annotation :",doc.annotations)       # the annotation objects in a documennt

completeset = []
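
A hedged sketch of running the DataProcess class shown earlier in this example end to end, using only the method names defined above and assuming the module-level reader and completeset set up here are in place:

processor = DataProcess()
dataset = processor.ProcessData()
arguments = processor.ClassifyArguments(dataset)
links = processor.ClassifyLinks(dataset)
print(len(arguments), 'argument components,', len(links), 'links')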

Example #14
#test
# TEST_DATA = ('data/test/')
#TEST_DATA = ('data_chinese/test/')
TEST_DATA = ('/home/linbo/Downloads/Annotation/military-corpus/')
TEST_FILEs = []
TEST_ALL_FILES = os.listdir(TEST_DATA)
for test_file_name in TEST_ALL_FILES:
    if test_file_name.split('.')[-1] == 'txt':
        TEST_FILEs.append(test_file_name[:-4])
# print(TEST_FILEs)
test_triggers, test_vec_trig, test_label_trig, test_args, test_vec_arg, test_label_arg = [], [], [], [], [], []
test_text = []
test_line = []
test_label_arg_for_each_trig = []
test_corpus = RepoModel(TEST_DATA)  # load corpus
for TEST_FILE in TEST_FILEs:
    test_doc = test_corpus.documents[TEST_FILE]  # get document with key
    test_ttriggers, test_tvec_trig, test_tlabel_trig, test_targs, test_tvec_arg, test_tlabel_arg, test_tlabel_arg_for_each_trig = get_events_in_mention(
        test_doc, bc)
    test_triggers.append(test_ttriggers)
    test_vec_trig.append(test_tvec_trig)
    test_label_trig.append(test_tlabel_trig)
    test_args.append(test_targs)
    test_vec_arg.append(test_tvec_arg)
    test_label_arg.append(test_tlabel_arg)
    test_label_arg_for_each_trig.append(test_tlabel_arg_for_each_trig)

    test_text.append(test_doc.text)

    for sent in test_doc.sentences:
Example #15
    def __init__(self):
        self.LinkType = ""
        self.DrugRepr = ""
        self.OtherRepr = ""
        self.textBetween = ""
        self.text_before = ""
        self.text_after = ""
        self.drug_start = 0
        self.drug_end = 0
        self.other_start = 0
        self.other_end = 0
        self.isPositive = False


source = sys.argv[1]
r = RepoModel(source)
# print(r.documents )
myfile = open("DrugInteractionCSV15.arff", 'w')
# wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# wr.writerow(["LinkType","DrugRepr","OtherRepr","BetweenLength","NumTokensBetween","IsPositive"])
myfile.write('''@RELATION wordcounts

@ATTRIBUTE LinkType string
@ATTRIBUTE DrugRepr string
@ATTRIBUTE OtherRepr string
@ATTRIBUTE BetweenLength numeric
@ATTRIBUTE NumTokensBetween numeric
@ATTRIBUTE betweenText string
@ATTRIBUTE isPositive {True,False}

@DATA
Example #16
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.corpus = RepoModel(data_dir)
        self.documents = self.corpus.documents
        self.sentences = self.load_sentences()