Code Example #1
def read_conllxi(path):
    lines = file_util.read_line_list(path, f_encoding='UTF-8')
    sentences = []
    temp = []
    for line in lines:
        if line == "":
            sentences.append(sentence.Sentence(temp))
            temp = []
        else:
            temp.append(line)

    if len(temp) != 0:
        sentences.append(sentence.Sentence(temp))

    return sentences
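
A minimal usage sketch for the reader above, assuming the project's file_util and sentence modules are importable; the path "sample.conllx" is invented for illustration.

# Hypothetical call; the file is expected to hold one token line per row,
# with blank lines separating sentences (as read_conllxi assumes).
sents = read_conllxi("sample.conllx")
print(len(sents), "sentences parsed")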
Code Example #2
 def process_srt(self,filename):
     with open(filename, mode='r') as file_srt:
         subs = file_srt.read()
     subs_buffer = subs.split('\n')
     for idx, val in enumerate(subs_buffer):
         if val.isdigit():
             time_label = subs_buffer[idx+1].split(" --> ")
             time_label_start = time_label[0].split(',')[0]
             time_label_start_remainder = time_label[0].split(',')[1]
             time_label_end = time_label[1].split(',')[0]
             time_label_end_remainder = time_label[1].split(',')[1]
             sub_text = subs_buffer[idx+2]
             sub_text = self.sentence_cleaning(sub_text)
             if sub_text !="" and idx+3<len(subs_buffer) :
                 # print len(subs_buffer)
                 # print idx
                 if subs_buffer[idx+3]!="" :
                     sub_text=sub_text+" "+subs_buffer[idx+3]
                     # note: reassigning idx below does not skip items in the enumerate loop
                     idx=idx+5
                 else:
                     idx=idx+4
                 new_sentence = sentence.Sentence(time_label_start,int(time_label_start_remainder), time_label_end,int(time_label_end_remainder),sub_text)
                 if len(self.sentences) > 0 and self.sentences[-1].time_end != new_sentence.time_start:
                     self.sentences[-1].time_end = new_sentence.time_start
                     self.sentences[-1].sample_end = new_sentence.sample_start
                 self.sentences.append(new_sentence)
Code Example #3
File: translation_database.py Project: Mindful/tldb
 def create_sentences(self, sentence_tuple_list):
     c = self.__get_cursor()
     c.executemany(
         "INSERT INTO " + self.sentence_table_name + " VALUES (?,?,?,?)",
         sentence_tuple_list)
     self.db.commit()
     return [sentence.Sentence(*x) for x in sentence_tuple_list]
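
A hedged usage sketch for create_sentences: the four placeholder columns are assumed, based on the get_sentence query in Code Example #5 below, to be content_id, language, sentence_number, and the sentence text, and db stands for an instance of the surrounding database class.

# Illustrative values only; the column interpretation is an assumption.
rows = [
    (1, "en", 0, "Hello world."),
    (1, "en", 1, "A second sentence."),
]
created = db.create_sentences(rows)   # returns the corresponding Sentence objects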
Code Example #4
File: filterabstracts.py Project: olabknbit/acres
def keepForDiabetesCorpus(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus
        Include abstract in diabetes corpus if it contains at least one cost value or term.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if abstractNodes is None or len(abstractNodes) == 0:
        return False

    textNodeList = abstractNodes[0].getElementsByTagName('AbstractText')
    if textNodeList is None or len(textNodeList) == 0:
        return False

    nCostValues = 0
    nCostTerms = 0
    tokenCount = 0
    cueLemmaSet = {"cost", "QALY", "QALYs"}

    for textNode in textNodeList:
        text = xmlutil.getText(textNode)
        sentenceList = sentenceSplitter.tokenize(text)
        for sText in sentenceList:
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            s = sentence.Sentence(tokenList)
            for token in s:
                tokenCount += 1
                lemmatizeabstracts.lemmatizeToken(token)
                if token.lemma in cueLemmaSet or token.text.find('cost') >= 0:
                    nCostTerms += 1
                if cvFinder.tokenIsCostValue(token):
                    nCostValues += 1

    return (nCostValues > 0 or nCostTerms > 0) and tokenCount > 100
Code Example #5
File: translation_database.py Project: Mindful/tldb
 def get_sentence(self, content_id, language, sentence_number):
     c = self.__get_cursor()
     c.execute(
         "SELECT * FROM " + self.sentence_table_name +
         " WHERE content_id=? and language=? and sentence_number=?",
         (content_id, language, sentence_number))
     sentence_data = c.fetchone()
     if sentence_data:
         return sentence.Sentence(*sentence_data)
     else:
         return None
Code Example #6
    def unplagarize(self):
        for x in xrange(0, len(self.sentences) / 2, 2):
            self.final_sentences.append(
                sentence.Sentence(self.sentences[x],
                                  self.sentences[x + 1]).unplagarize)
        print self.sentences
        print self.final_sentences

        full_paragraph = self.final_sentences[0]
        for x in xrange(1, len(self.final_sentences)):
            full_paragraph = full_paragraph + " " + self.final_sentences[x]
        return full_paragraph
Code Example #7
File: model.py Project: speedcell4/dmvccm
    def __init__(self, treebank=None):

        treebank = self._get_treebank(treebank)

        S, Gold = [], []
        for t in treebank.get_trees():
            s = sentence.Sentence(t.leaves())
            S += [s]
            # Gold += [depset.deptree_to_depset(t)]
            Gold += [t.depset]

        self.S = S
        self.Gold = Gold
Code Example #8
def main():
    """
    baseline system which tags using only the gazette
    """
    dic, max_key_len = gazette.load(_OPTS.gazette)

    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        sent.tag_nes(dic, max_key_len)
        filtered = [_.json_obj for _ in _filter_dic_nes(sent.dic_nes)]
        sent_obj['NE'] = filtered
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)
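
The JSON document expected on stdin is not shown in these examples; judging from the fields accessed here and in Code Examples #12 and #16, a minimal input might look roughly like the sketch below. The exact schema, and in particular the morpheme list, is an assumption.

# Rough shape of the assumed input; field names other than 'sentence',
# 'text', 'NE', 'begin', 'end' and 'type' are guesses.
example_doc = {
    "sentence": [
        {
            "text": "Example sentence.",
            "morp": [],   # morpheme analyses consumed by sentence.Sentence
            "NE": []      # named entities, overwritten by the tagger
        }
    ]
}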
Code Example #9
def main():
    """
    convert from JSON to CRFsuite feature format
    """
    dic, max_key_len = gazette.load(_OPTS.gazette)

    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        sent.tag_nes(dic, max_key_len)
        for morp in sent.morps:
            features = feature.get_all_feat(sent, morp.id())
            print('%s\t%s' % (sent.label(morp.id()), '\t'.join(features)))
        print()
Code Example #10
	def make_sentences(self):
		'''Make a sentence from all consecutive words from 1 speaker between eol (. ! ?).'''
		if verbose: print('creating sentences, words between eols')
		self.sentences = []
		sentence_wl = []
		sentence_index = 0
		for w in self.words:
			sentence_wl.append(w)
			if w.eol:
				self.sentences.append(sentence.Sentence(sentence_wl,sentence_index))
				if self.corpus == 'CGN':
					self.sentences[-1].overlap_unknown = False
				sentence_index += 1
				sentence_wl = []
		self.nsentences = len(self.sentences)
Code Example #11
def main(w2v_path, model_path):
    """
    tag person(PS) with SVM classifier
    :param  w2v_path:    word2vec file path
    :param  model_path:  model path
    """
    w2v_dic = word2vec.load(w2v_path)
    svm_model = cPickle.load(open(model_path, 'rb'))

    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        ps_nes = _tag_ps(w2v_dic, svm_model, sent)
        sent_obj['NE'] = _merge_ne(sent, ps_nes)
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)
Code Example #12
File: eval.py Project: songys/annie
def _count(gold, test):
    """
    count gold, test and matched NEs
    :param  gold:  gold standard
    :param  test:  test
    :return:       (gold, test, match) counter triple
    """
    gold_cnt = defaultdict(int)
    test_cnt = defaultdict(int)
    match_cnt = defaultdict(int)
    for gold_sent, test_sent in zip(gold['sentence'], test['sentence']):
        if len(gold_sent['text']) != len(test_sent['text']):
            logging.error('content of sentences are different:')
            logging.error('\tgold: %s', gold_sent['text'])
            logging.error('\ttest: %s', test_sent['text'])
            sys.exit(2)
        gold_nes = set(
            [NE(_['begin'], _['end'], _['type']) for _ in gold_sent['NE']])
        # gold_cnt.update([_.cate for _ in gold_nes])    # Counter only in 2.7
        for entity in gold_nes:
            gold_cnt[entity.cate] += 1
        test_nes = set(
            [NE(_['begin'], _['end'], _['type']) for _ in test_sent['NE']])
        # test_cnt.update([_.cate for _ in test_nes])    # Counter only in 2.7
        for entity in test_nes:
            test_cnt[entity.cate] += 1
        match_nes = gold_nes & test_nes
        # match_cnt.update([_.cate for _ in match_nes])    # Counter only in 2.7
        for entity in match_nes:
            match_cnt[entity.cate] += 1
        if ERR_CATE:
            gold_only_nes = set(
                [_ for _ in (gold_nes - match_nes) if _.cate in ERR_CATE])
            test_only_nes = set(
                [_ for _ in (test_nes - match_nes) if _.cate in ERR_CATE])
            if gold_only_nes or test_only_nes:
                sent = sentence.Sentence(gold_sent)
                print(sent.to_dbg_str(), file=sys.stderr)
            for ett in sorted(list(gold_only_nes)):
                print('\t[G] (%s) %s' %
                      (ett.cate, _morp_dbg_str(sent, ett.begin, ett.end)),
                      file=sys.stderr)
            for ett in sorted(list(test_only_nes)):
                print('\t[T] (%s) %s' %
                      (ett.cate, _morp_dbg_str(sent, ett.begin, ett.end)),
                      file=sys.stderr)
    return gold_cnt, test_cnt, match_cnt
Code Example #13
 def get_sentences(paragraphs):
     """Get tokenized sentences within each paragraph from a list of
     paragraphs where each paragraph is a string or a list of sentences.
     """
     # Note that this generator yields
     # paragraph = [sent1, sent2, ...] and sent = [token1, token2, ...]
     offset = 0
     for p, paragraph in enumerate(paragraphs):
         sents = split(paragraph) if isinstance(paragraph, str) \
             else paragraph
         yield [
             sentence.Sentence(raw=sent,
                               sentid=(offset + s),
                               rel_id=s,
                               par_id=p) for s, sent in enumerate(sents)
         ]
         offset += len(sents)
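
A short usage sketch for the generator above, called here as a plain function for illustration; it assumes plain-string paragraphs and that split is the module's own sentence splitter, both taken from the code rather than verified elsewhere.

# One list of Sentence objects is yielded per paragraph; sentid runs across
# paragraphs while rel_id restarts at 0 inside each one.
paragraphs = ["First sentence. Second one.", "A new paragraph."]
for par_id, sents in enumerate(get_sentences(paragraphs)):
    print(par_id, [s.sentid for s in sents])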
Code Example #14
 def __check_top_results(masked_sentence, mistake_position, ngrams,
                         top_results):
     i = 0
     while i != len(top_results["token_strs"]):
         suggestion_sentence = masked_sentence.replace(
             "[MASK]", top_results["token_strs"][i])
         suggestion_ngrams = ngram_model.Ngrams(
             sentence.Sentence(suggestion_sentence), ngrams.language)
         suggestion_mistake_positions = error_checker.get_mistake_positons(
             suggestion_ngrams)
         if mistake_position in suggestion_mistake_positions:
             del top_results["token_strs"][i]
             del top_results["scores"][i]
             del top_results["ranks"][i]
         else:
             i += 1
     return top_results
Code Example #15
    def __init__(self, treebank=None, training_corpus=None):

        treebank = self._get_treebank(treebank)
        if training_corpus == None:
            training_corpus = treebank
        self.training_corpus = training_corpus

        S, Gold = [], []
        #for s in treebank.sents():
        for s in treebank.tagged_sents():
            s = [x[1] for x in s]
            S += [sentence.Sentence(s)]

        for t in treebank.parsed_sents():
            Gold += [bracketing.tree_to_bracketing(t)]

        self.S = S
        self.Gold = Gold
Code Example #16
def main():
    """
    convert from CRFsuite IOB tagged to JSON
    """
    json_obj = json.load(codecs.open(_OPTS.json, 'rt', encoding='UTF-8'))
    iobs = _load_iob_sentences(sys.stdin)
    if len(json_obj['sentence']) != len(iobs):
        logging.error('# of sentences are different %d vs %d', len(json_obj['sentence']), len(iobs))
        sys.exit(1)

    for sent_obj, iob in zip(json_obj['sentence'], iobs):
        sent = sentence.Sentence(sent_obj)
        if len(sent.morps) != len(iob):
            logging.error('morpheme lengths in sentence are different:')
            logging.error('\tjson: %s', len(sent.morps))
            logging.error('\tiob : %s', len(iob))
            sys.exit(2)
        sent_obj['NE'] = _make_nes(sent, iob)
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)
Code Example #17
File: filterabstracts.py Project: olabknbit/acres
def keepForDiabetesCorpusCostValue(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus
        Include abstract in diabetes corpus if it contains at least *one* currency value.
    """
    textNodeList = xmldoc.getElementsByTagName('AbstractText')
    nCostValues = 0
    for textNode in textNodeList:
        text = xmlutil.getText(textNode)
        sentenceList = sentenceSplitter.tokenize(text)
        for sText in sentenceList:
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            s = sentence.Sentence(tokenList)
            for token in s:
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    nCostValues += 1

    return nCostValues > 0
Code Example #18
File: model.py Project: speedcell4/dmvccm
    def __init__(self, treebank=None, training_corpus=None):
        """
        The elements of the treebank must be trees with a DepSet in the
        attribute depset.
        """
        treebank = self._get_treebank(treebank)
        if training_corpus is None:
            training_corpus = treebank
        self.test_corpus = treebank
        self.training_corpus = training_corpus
        S = []
        for s in treebank.tagged_sents():
            s = [x[1] for x in s]
            S += [sentence.Sentence(s)]
        self.S = S
        # Extract gold as DepSets:
        # FIXME: call super and do this there.
        self.Gold = [t.depset for t in treebank.parsed_sents()]

        # Extract gold as Bracketings:
        self.bracketing_model = model.BracketingModel(treebank)
Code Example #19
    def test(self, sentences):
        """
        Inference on test sentences
        input: test sentences
        """
        test_sentences = []
        inferred_trees = []
        total = 0
        for s in sentences:
            test_sentences += [sentence.Sentence(s.words, s.pos_tags)]

        for s in test_sentences:
            trees = s.get_trees()
            i = 0
            no_construction = False
            while (len(trees) > 0):
                if i == (len(trees) - 1):
                    if no_construction == True:
                        break
                    # if we reach the end start from the beginning
                    no_construction = True
                    i = 0
                else:
                    # extract features
                    extracted_features = self.extract_test_features(
                        trees, i, LEFT_CONTEXT, RIGHT_CONTEXT, self.N_FEATURES)

                    # estimate the action to be taken for i, i+ 1 target  nodes
                    y = self.estimate_action(trees, i, extracted_features)
                    i, trees = self.take_action(trees, i, y)
                    # execute the action and modify the trees
                    if y != SHIFT:
                        no_construction = False
                    self.test_actions[y] += 1
            if (len(trees) == 1):
                total += 1
                # print total
            inferred_trees += [trees]

        return inferred_trees
Code Example #20
File: instance.py Project: wanlinxie/dissertation
    def _process_sentences(self, sentences):
        """Generate a list of Sentence or MultiSentence objects from a variety
        of inputs.
        """
        if len(sentences) == 0:
            self.sentences = sentences
            return

        # NOTE: We base all checks on the first tuple in order to ensure
        # consistent (and more efficient) processing.

        # If we already have Sentence-like objects, do nothing
        if isinstance(sentences[0], sentence.Sentence) or \
                isinstance(sentences[0], sentence.MultiSentence):
            self.sentences = sentences

        # If we have a list of strings, assume that each element represents a
        # separate sentence (we expect a list of sentences as input)
        elif isinstance(sentences[0], basestring):
            self.sentences = [sentence.Sentence(s) for s in sentences]

        # If we have a list of lists/tuples, look deeper
        elif isinstance(sentences[0], list) or isinstance(sentences[0], tuple):

            # If it's a list/tuple of strings, check the last string.
            if isinstance(sentences[0][0], basestring):

                # If the last string is sentence-terminating punctuation
                # or doesn't feature a space, assume each string represents
                # a word, and therefore each list/tuple represents a
                # single sentence.
                if sentences[0][-1] in ('.', '?', '!', '\"', '\'') or \
                        (len(sentences[0]) > 1 and \
                        ' ' not in sentences[0][-1]):
                    self.sentences = [sentence.Sentence(s) for s in sentences]

                # Otherwise, assume that each string represents a full
                # sentence and therefore each list/tuple represents a group of
                # multiple connected sentences.
                else:
                    self.sentences = [
                        sentence.MultiSentence(map(sentence.Sentence, ms))
                        for ms in sentences
                    ]

            # If it's a list/tuple of lists/tuples, just assume that they each
            # contain strings representing words. The inner lists should
            # represent sentences while the outer lists should represent
            # groups of multiple connected sentences.
            elif isinstance(sentences[0][0], list) or \
                    isinstance(sentences[0][0], tuple):
                self.sentences = [
                    sentence.MultiSentence(map(sentence.Sentence, ms))
                    for ms in sentences
                ]
            else:
                print "ERROR: unknown type", str(sentences[0].__class__)
                print "Expected Sentence-like objects or lists of strings",
                print "convertible to Sentence-like objects"
                raise TypeError
        else:
            print "ERROR: unknown type", str(sentences[0].__class__)
            print "Expected Sentence-like objects or lists of strings",
            print "convertible to Sentence-like objects"
            raise TypeError
Code Example #21
File: main.py Project: ricci5791/python-education
"""some"""
import sentence

sen = sentence.Sentence("Hello world.")

print("Lazy iterator")
print(sen._words())
print(next(sen._words()))

print("\nFor loop:")
for i in sen:
    print(i)

print("\nSentence.words: ")
print(sen.words)
print("\nSentence.chars_count: ")
print(sen.chars_count)
print("\nSentence.other_chars: ")
print(sen.other_chars)
print("\nSentence[0]: ")
print(sen[0])
print("\nSentence[:]: ")
print(sen[:])

gen = iter(sen)

for i in gen:
    print(i)

print("\nUsing next after using generator in loop:")
print(next(gen))
Code Example #22
def read_sentences_normalize_ne(stanford_file_name):
    stanford_file = codecs.open(stanford_file_name, 'r', 'utf-8')

    sentences = []
    tokens = []

    token_alignments = []
    text_line = ''

    state = False
    ne_state = False
    money_state = False
    percent_state = False
    number_state = False
    ordinal_state = False
    time_state = False
    date_state = False
    duration_state = False
    set_state = False
    last_ne_tag = ''
    token_counter = 0

    date_re = re.compile(r'^(\d\d\d\d|XXXX)-(\d\d|XX)-(\d\d|XX)$')
    date2_re = re.compile(r'^(\d\d\d\d|XXXX)-(\d\d|XX)$')
    date3_re = re.compile(r'^(\d\d\d\d|XXXX)$')

    for line in stanford_file:
        if line.startswith('Sentence #'):
            if state:
                sentences.append(asent.Sentence(tokens, token_alignments))
                tokens = []
                token_alignments = []
                state = False
                ne_state = False
                money_state = False
                percent_state = False
                number_state = False
                ordinal_state = False
                time_state = False
                date_state = False
                duration_state = False
                set_state = False
                last_ne_tag = ''
                token_counter = 0
        elif line.startswith('[Text=') and line[-2] == ']':
            token = asent.Token.parse_stanford_line(line[1:-2], {})
            #For LOCATION, PERSON, ORGANIZATION, MISC.
            if ne_state and not (token.is_ne and token.ne_tag == last_ne_tag):
                ne_state = False
            if not ne_state and token.is_ne and token.ne_tag in \
                ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']:
                ne_state = True
                # Appends to the front.
                last_ne_tag = token.ne_tag
                token.constant_label = 'name'
                token.const_lexeme = token.word
            # For MONEY:
            if money_state and not (token.is_ne and token.ne_tag == 'MONEY'):
                money_state = False
            elif not money_state and token.is_ne and token.ne_tag == 'MONEY':
                money_state = True
                money_str = token.normalized_ne_tag
                if len(money_str) == 0:
                    # Not treated as money.
                    token.is_ne = False
                    token.ne_tag = ''
                    money_state = False
                elif len(money_str) > 1:  # length 1 is for units
                    unit_ind = 1 if money_str[0] in ['>', '<', '~'] else 0
                    if money_str[1] == '=':
                        unit_ind = 2
                    token.const_lexeme = convert_number(money_str, True)
            # Percentage.
            if percent_state and not (token.is_ne
                                      and token.ne_tag == 'PERCENT'):
                percent_state = False
            elif not percent_state and token.is_ne and token.ne_tag == 'PERCENT':
                percent_state = True
                percent_str = token.normalized_ne_tag
                if len(percent_str) > 1:
                    token.normalized_ne_tag = convert_number(percent_str, True)
            if number_state and not (token.is_ne and token.ne_tag == 'NUMBER'):
                number_state = False
            elif not number_state and token.is_ne and token.ne_tag == 'NUMBER':
                number_state = True
                number_str = token.normalized_ne_tag
                if len(number_str) == 0:
                    number_state = False
                    token.is_ne = False
                    token.ne_tag = ''
                else:
                    token.const_lexeme = convert_number(number_str, False)
            if ordinal_state and not (token.is_ne
                                      and token.ne_tag == 'ORDINAL'):
                ordinal_state = False
            elif not ordinal_state and token.is_ne and token.ne_tag == 'ORDINAL':
                ordinal_state = True
                number_str = token.normalized_ne_tag
                if len(number_str) == 0:
                    number_state = False
                    token.is_ne = False
                    token.ne_tag = ''
                else:
                    token.const_lexeme = convert_number(number_str, False)
            if time_state and not (token.is_timex
                                   and token.ne_tag in ['DATE', 'TIME']):
                time_state = False
            elif not time_state and (token.is_timex
                                     and token.ne_tag in ['DATE', 'TIME']):
                # The same temporal expression can contain both DATE and TIME.
                time_state = True
            if time_state and not date_state and token.ne_tag == 'DATE':
                # Only match pure date expressions
                # - cannot convert compound expressions cleanly enough.
                date_str = token.normalized_ne_tag
                if len(date_str.split()) == 1:
                    # Strip time from string.
                    if 'T' in date_str:
                        date_str = date_str[:date_str.index('T')]
                    if re.match(r'^\d\d\dX$', date_str):
                        date_str = date_str[:3] + '0'
                    if re.match(r'^\d\dXX$', date_str):
                        date_str = date_str[:2] + '00'
                    m = date_re.match(date_str)
                    m2 = date2_re.match(date_str)
                    m3 = date3_re.match(date_str)
                    if m or m2 or m3:
                        date_state = True
                        if m:
                            date_list = list(m.groups())
                        elif m2:
                            date_list = list(m2.groups())
                        elif m3:
                            date_list = list(m3.groups())
                        date_list = filter(lambda d: 'X' not in d, date_list)
                        date_list = [
                            convert_number(date, False) for date in date_list
                        ]
                        if date_list:
                            token.const_lexeme = date_list[0]
                    #else don't handle as a date.
            if date_state and token.ne_tag <> 'DATE':
                date_state = False
            # For Duration:
            if duration_state and not (token.is_timex
                                       and token.ne_tag == 'DURATION'):
                duration_state = False
            elif not duration_state and token.is_timex and token.ne_tag == 'DURATION':
                duration_state = True
                time_str = token.normalized_ne_tag
                period, unit = convert_period(time_str)
                if period == 0:
                    duration_state = False
                else:
                    token.const_lexeme = str(period)
                    token.ne_tag += '_' + unit
            # For SET:
            if set_state and not (token.is_timex and token.ne_tag == 'SET'):
                set_state = False
            elif not set_state and token.is_timex and token.ne_tag == 'SET':
                set_state = True
                freq = 1
                period = 0
                unit = ''
                if token.timex_attr.has_key('freq'):
                    rate_re = re.compile(r'P(\d\d*)([A-Z])')
                    freq_m = rate_re.match(token.timex_attr['freq'])
                    freq = int(freq_m.group(1))
                if token.timex_attr.has_key('periodicity'):
                    period, unit = convert_period(
                        token.timex_attr['periodicity'])
                if period == 0:
                    set_state = False
                    token.ne_tag = ''
                else:
                    if freq > 1:
                        token.ne_tag += '_rate'
                    token.const_lexeme = str(period)
                    token.ne_tag += '_temporal_' + unit
            # Identify numbers:
            if re.match(r'^[+-]?\d+(\.\d+)?$', token.word):
                if token.const_lexeme == '':
                    token.const_lexeme = convert_number(token.word, False)
                token.constant_label = 'number'
            token.pred_lexeme = token.word
            tokens.append(token)
            state = True
    if state:
        sentences.append(asent.Sentence(tokens))
    return sentences
Code Example #23
def train(args: Dict):
    MAX_LEN = int(args['--max-len'])
    bs = int(args['--batch-size'])
    model_root = args['--model-root'] if args['--model-root'] else './models'

    dataLoader= sentence.Sentence(args['--train-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    if args['--cuda']:
        n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]

    print(dataLoader.sentences[0])
    print(tokenized_texts[0])

    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

    """
    The BERT Model requires us to have a [SEP] token at the end of each sentence as a part of its preprocessing. 102 is the index BERT recognizes as the index of [SEP]. Hence, I am adding it to the end of the sentence after padding/truncating
    (as it might have been removed if the sequences were greater than 75 in length) to be compatible with BERT's requirement. I didn't have it in the beginning and I thought it would be the reason for the poor results but changing it didn't help and I chose to keep it anyways as it felt right. :)
    """
    for i, inp in enumerate(input_ids):
        if (102 not in inp):
            inp[-1] = 102
            tags[i][-1] = dataLoader.tag2idx.get("O")

    tts = float(args['--train-test-split'])

    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                                random_state=10, test_size=tts)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                 random_state=10, test_size=tts)

    tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
    val_inputs = torch.tensor(val_inputs).to(torch.int64)
    tr_tags = torch.tensor(tr_tags).to(torch.int64)
    val_tags = torch.tensor(val_tags).to(torch.int64)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

    model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(dataLoader.tag2idx))

    if args['--cuda']:
        model.cuda()

    FULL_FINETUNING = True if args['--full-finetuning'] else False
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    optimizer = Adam(optimizer_grouped_parameters, lr=float(args['--lr']))

    epochs = int(args['--max-epoch'])
    max_grad_norm = 1.0
    hist_valid_scores = []

    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask, labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)
                logits = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        pred_tags = [dataLoader.tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [dataLoader.tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        f1=f1_score(valid_tags,pred_tags)
        print("F1-Score: {}".format(f1))

        is_better = len(hist_valid_scores) == 0 or f1 > max(hist_valid_scores)
        hist_valid_scores.append(f1)
        if is_better:
            output_model_file = os.path.join(model_root, "model_file.bin")
            output_config_file = os.path.join(model_root, "config_file.bin")
            output_vocab_file = model_root

            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_vocab_file)

    print('reached maximum number of epochs!', file=sys.stderr)
    exit(0)
Code Example #24
def evaluate(args:Dict):
    model_root = args['--model-root'] if args['--model-root'] else './models'
    print("load model from {}".format(model_root), file=sys.stderr)

    dataLoader = sentence.Sentence(args['--test-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    output_model_file = os.path.join(model_root, "model_file.bin")
    output_config_file = os.path.join(model_root, "config_file.bin")
    output_vocab_file = os.path.join(model_root, "vocab.txt")
    config = BertConfig.from_json_file(output_config_file)
    model = BertForTokenClassification(config,num_labels=len(dataLoader.tag2idx))
    state_dict = torch.load(output_model_file)
    model.load_state_dict(state_dict)
    tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False)

    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    MAX_LEN = int(args['--max-len'])

    input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test]

    for i, inp in enumerate(input_ids_test):
        if (102 not in inp):
            inp[-1] = 102
            tags_test[i][-1] = dataLoader.tag2idx.get("O")

    te_inputs = torch.tensor(input_ids_test).to(torch.int64)
    te_tags = torch.tensor(tags_test).to(torch.int64)
    te_masks = torch.tensor(attention_masks_test)

    test_data = TensorDataset(te_inputs, te_masks, te_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=int(args['--batch-size']))

    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)

        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions]
    test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]

    tags_test_fin = list()
    for l in tags_test:
        temp_tag = list()
        for l_i in l:
            temp_tag.append(dataLoader.tags_vals[l_i])
        tags_test_fin.append(temp_tag)

    print("Test loss: {}".format(eval_loss / nb_eval_steps))
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags)))

    print(classification_report(tags_test_fin, pred_tags))

    print("Number of Test sentences: ", len(tags_test_fin))
Code Example #25
File: alignment.py Project: jrgillick/Applause
 def get_sentence_list(self):
     sentences = [sent for sent in nlp(self.transcript).sents]
     return [
         sentence.Sentence(self, s.start_char, s.end_char)
         for s in sentences
     ]
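
A hedged sketch of the surrounding setup: nlp above is presumably a spaCy pipeline created elsewhere in alignment.py, and self.transcript a plain-text transcript; the loading call and sample text below are assumptions, not taken from the project.

# Any spaCy pipeline with sentence segmentation exposes .sents like this.
import spacy

nlp = spacy.load("en_core_web_sm")
transcript = "Thank you all. It is great to be here."
spans = list(nlp(transcript).sents)
print([(s.start_char, s.end_char) for s in spans])   # the offsets handed to sentence.Sentence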
Code Example #26
def read_sentences(stanford_file_name, file_id):
    stanford_file = codecs.open(stanford_file_name, 'r', 'utf-8')

    sentences = []
    raw_sentences = []
    tokens = []

    text_line = ''
    state_line = ''
    sent_offset = 0
    state = False
    state1 = False

    for line in stanford_file:
        if line.startswith('Sentence #'):
            if state:
                sentences.append(asent.Sentence(tokens))
                sentences[-1].offset = sent_offset
                sentences[-1].raw_txt = text_line
                sentences[-1].file_id = file_id
                text_line = ''
                state_line = ''
                tokens = []
                state = False
                state1 = False
        elif len(line) > 1 and line[-2] == ']' and (state or
                                                    line.startswith('[Text=')):
            if state_line:
                token = asent.Token.parse_stanford_line(
                    state_line + ' ' + line[:-2], {})
            else:
                token = asent.Token.parse_stanford_line(line[1:-2], {})
            if not state1:
                sent_offset = token.char_start
            ind_start = token.char_start - sent_offset
            ind_end = token.char_end - sent_offset
            token.reset_char_spans(ind_start, ind_end)

            word = token.original_word
            word = word.replace(u"\u00A0", "_")
            if '_' in word:
                split_word = word.split('_')
                split_inds = filter(lambda x: word[x] == '_', range(len(word)))
                first_word = word[:split_inds[0]]
                token.original_word = first_word
                token.word = first_word
                if normalize_ne:
                    token.pred_lexeme = first_word.lower()
                else:
                    token.pred_lexeme = first_word.lower(
                    ) + u'/' + token.pos.lower()
                token.const_lexeme = first_word
                token.char_end = token.char_start + split_inds[0]
                tokens.append(token)
                for j, w in enumerate(split_word[1:]):
                    char_start = token.char_start + split_inds[j] + 1
                    if j + 1 < len(split_inds):
                        char_end = token.char_start + split_inds[j + 1]
                    else:
                        char_end = token.char_start + len(word)
                    new_token = asent.Token(w,
                                            w,
                                            token.pos,
                                            token.constant_label,
                                            token.is_ne,
                                            token.is_timex,
                                            token.ne_tag,
                                            token.normalized_ne_tag,
                                            char_start=char_start,
                                            char_end=char_end)
                    tokens.append(new_token)
            else:
                tokens.append(token)
            state = True
            state1 = True
        elif line.startswith('[Text='):
            state_line = line[1:].strip()
            state = True
        else:  #if line.strip():
            if state:
                state_line += ' ' + line.strip()
            else:
                text_line += line.replace('\n', ' ')
    if state:
        sentences.append(asent.Sentence(tokens))
        sentences[-1].offset = sent_offset
        sentences[-1].raw_txt = text_line
        sentences[-1].file_id = file_id
    return sentences
Code Example #27
 def __init__(self, user_input, language_string):
     self.sentence = sentence.Sentence(user_input)
     self.ngrams = ngram_model.Ngrams(self.sentence,
                                      Language(language_string))
     self.mistake_positions = error_checker.get_mistake_positons(
         self.ngrams)
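
A hedged construction sketch: the class that owns this __init__ is not named in the excerpt, so Checker below is an invented placeholder, and the form of the string accepted by Language is an assumption.

# Hypothetical wrapper; only the constructor signature above comes from the example.
checker = Checker("This are a example sentence.", "english")
print(checker.mistake_positions)   # positions flagged by error_checker on the n-grams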
Code Example #28
File: preprocess.py Project: vt257/allnews-am

def writeInfile(data, filename):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    subword_len_counter = 0
    with open(filename, 'wt', encoding='utf-8') as f:
        for sentence in data:
            for (token, key) in sentence:
                current_subwords_len = len(tokenizer.tokenize(token))
                if current_subwords_len == 0:
                    continue
                if (subword_len_counter + current_subwords_len) > 512:
                    f.write("\n")
                    f.write((token + ' ' + key + '\n'))
                    subword_len_counter = 0
                    continue
                subword_len_counter += current_subwords_len
                f.write((token + ' ' + key + '\n'))
            f.write('\n')


if __name__ == '__main__':
    traindevgetter = sentence.Sentence('pioner-silver/train.conll03')
    testgetter = sentence.Sentence('pioner-silver/dev.conll03')

    train, dev = partitionRankings(traindevgetter.tagged_sentences, 0.1)

    writeInfile(list(train), 'data/train.txt')
    writeInfile(list(dev), 'data/dev.txt')
    writeInfile(list(testgetter.tagged_sentences), 'data/test.txt')
Code Example #29
def genParagraph():
    """generate paragraph"""
    num_sentences = random.randrange(10, 30)
    sentences = [(" ".join(sentence.Sentence())).capitalize() + "." for i in range(num_sentences)]
    return "<p>{0}</p>".format(" ".join(sentences))