Example #1
def extract_features(wordseg, ql, qr):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    feature_dict = {}

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    return feature_dict
Example #2
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql) 
    sent_l.basic_words = wordseg_out.basic_words
    
    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr) 
    sent_r.basic_words = wordseg_out.basic_words
    
    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)
    
    count_tfidf_hash_features = get_tfidf_count_hash_features(sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(l_notion, r_notion, tfidf_count_hash_vectorModels, "notion")
    feature_dict.update(notion_count_tfidf_hash_features)
    
    for k in feature_dict:
        print(k)
    
    return feature_dict
Example #3
def extract_features(wordseg, ql, qr, sent_word2vec, word_weights, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    feature_dict = {}

    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, l_notion, r_notion,
                                            "notion")
    feature_dict.update(sentvec_features)

    for k, value in feature_dict.items():
        print(k)
        print(value)

    return feature_dict
Example #4
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model):

    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)

    return feature_dict
Example #5
    def parse_expression(self, scope):
        """Parse an expression.

        Expressions may contain other forms as well; an expression is the
        'other' type of sentence. It must match a known sentence in the
        scope.

        :param scope: The scope the expression is being parsed within.
        :return: A Sentence."""
        token = next(self._token_stream)
        if isinstance(token, ValueToken):
            return Sentence([token])
        elif not isinstance(token, FirstToken):
            raise ParseError('Cannot begin a sentence with \"' + repr(token) +
                             '\"')
        elif 'Define' == token.text:
            self._token_stream.push_back(token)
            return self.parse_definition(scope)
        node = Sentence([token])
        part_match = scope.new_matcher()
        if not part_match.next(token):
            self._token_stream.push_back(token)
            raise ParseError('Sentence not matched.', node)
        for token in self._token_stream:
            if isinstance(token, FirstToken):
                self._token_stream.push_back(token)
                if part_match.next():
                    node.append(self.parse_expression(scope))
                elif node.ends_with_dot() and part_match.has_end():
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(token, WordToken):
                if part_match.next(token):
                    node.append(token)
                elif node.ends_with_dot() and part_match.has_end():
                    self._token_stream.push_back(token)
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(token, PeriodToken):
                if part_match.has_end():
                    node.append(token)
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(token, ValueToken):
                if part_match.next():
                    node.append(Sentence([token]))
                else:
                    raise ParseError('Sentence not matched.', node)
            else:
                raise ValueError('Unknown Token Kind: {}'.format(type(token)))
        if isinstance(node[-1], Sentence) and part_match.has_end():
            return node
        raise ParseError('Sentence not matched.', node)
Example #6
def get_docs_list(conn, courseid, questionid):
    """
    Parameters:
        conn: A mysql connection.
        courseid: String
        questionid: Integer
    Return:
        Integer: the number of references.
        Two lists: the textids and the docs (class Sentence); the leading entries of both are the references, using negative numbers as their textids.
    """
    # Get reference of certain courseid and questionid.
    try:
        get_ref_sql = "SELECT ref FROM standards WHERE courseid=%s AND questionid=%s"
        get_ref_cur = conn.cursor()
        get_ref_cur.execute(get_ref_sql, (courseid, questionid))
        refs = get_ref_cur.fetchall()
        # ref = get_ref_cur.fetchone()[0]
    except Exception as e:
        print("Error getting reference of courseid", courseid, "questionid", questionid)
        print(traceback.print_exc())
    finally:
        get_ref_cur.close()

    lang = "ch" if courseid == "201英语一" else "en"
    doc_matrix = []
    textids = []
    ref_id = -1
    for ref in refs:
        reference = Sentence(text=ref[0], language=lang)
        reference.preprocess()
        doc_matrix.append(reference)   # add Sentence reference as the first term of doc_matrix
        textids.append(ref_id)   # Use negative numbers as reference textid.
        ref_id -= 1
    # Get all detection text of certain courseid and questionid.
    detections = None
    try:
        get_detection_sql = "SELECT textid, text FROM detection WHERE courseid = %s and questionid = %s"
        get_detection_cur = conn.cursor()
        if get_detection_cur.execute(get_detection_sql, (courseid, questionid)):
            detections = get_detection_cur.fetchall()
        else:
            print("No quesion", questionid, "of", courseid, "in DETECTION DB.")
    except Exception as e:
        print("Error getting text...", traceback.print_exc())
    finally:
        get_detection_cur.close()

    # Add all detections into doc_matrix
    if detections is None:
        return
    for dt in detections:
        textids.append(dt[0])
        cur_ans = Sentence(text=dt[1], language=lang)
        cur_ans.preprocess()
        doc_matrix.append(cur_ans)
    return len(refs), textids, doc_matrix
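The function above only assumes a DB-API connection whose cursors take %s placeholders (pymysql, MySQLdb and similar drivers work this way). A minimal, hypothetical usage sketch; the connection parameters and the courseid/questionid values are made up, and the function returns None when no detection rows exist:

import pymysql  # any %s-placeholder DB-API driver would do; pymysql is an assumption

conn = pymysql.connect(host="localhost", user="root", password="secret",
                       database="grading", charset="utf8mb4")
try:
    result = get_docs_list(conn, "201英语一", 1)
    if result is not None:
        n_refs, textids, doc_matrix = result
        print(n_refs, "reference(s) and", len(doc_matrix) - n_refs, "detection texts")
finally:
    conn.close()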
Example #7
    def parse_definition(self, outer_scope):
        """Parse a definition from the incomming tokens.

        This is technically a kind of expression, but there are a few special
        rules that may force it to become separate. This is temporary as it
        is just the fastest way I can get this to work. I hope.

        'Define Function or variable name. to be Body. .'"""
        token = next(self._token_stream)
        if 'Define' != token.text:
            raise ParseError('Invalid start of definition: ' + str(token))
        node = Sentence(token)
        ptr = outer_scope.new_matcher()
        if not ptr.next(token):
            self._token_stream.push_back(token)
            raise ParseError('Sentence not matched.', node)
        inner_scope = None
        for item in self._token_stream:
            if isinstance(item, FirstToken):
                self._token_stream.push_back(item)
                if ptr.next():
                    if inner_scope is None:
                        signature = self.parse_signature()
                        node.append(signature)
                        inner_scope = outer_scope.new_define_scope(signature)
                    else:
                        node.append(self.parse_expression(inner_scope))
                elif node.ends_with_dot() and ptr.has_end():
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(item, WordToken):
                if ptr.next(item):
                    node.append(item)
                elif node.ends_with_dot() and ptr.has_end():
                    self._token_stream.push_back(item)
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(item, PeriodToken):
                if ptr.has_end():
                    node.append(item)
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(item, ValueToken):
                if ptr.next():
                    node.append(Sentence(item))
                else:
                    raise ParseError('Sentence not matched.', node)
            else:
                raise TypeError('Parser.parse_definition: Unexpected type' +
                                str(type(item)))
        if node.ends_with_dot() and ptr.has_end():
            return node
        raise ParseError('Sentence not matched.', node)
Example #8
    def __iter__(self):
        with open(self.fr_file) as fp_fr, open(self.en_file) as fp_en:
            for i, (line_fr, line_en) in enumerate(zip(fp_fr, fp_en)):
                if self.max_line and i > self.max_line:
                    break

                fr = Sentence(line_fr.strip(), 'fr', i, self.name)
                en = Sentence(line_en.strip(), 'en', i, self.name)

                yield fr, en
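This __iter__ pairs the i-th French and English lines of two files into aligned Sentence objects and stops once max_line is exceeded. A hedged sketch of how an instance might be consumed; ParallelCorpus stands in for whatever class actually defines the method (its real constructor is not shown here), and the file names are invented:

corpus = ParallelCorpus(fr_file="train.fr", en_file="train.en",
                        name="europarl", max_line=1000)  # hypothetical constructor
for fr, en in corpus:
    # fr and en are aligned Sentence objects built from the same line index
    print(fr, "|||", en)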
Example #9
 def read_sentence(self, filename):
     # print("________________________________________________________________________________________________read_sentence starts")
     sentence = Sentence()
     for line in open(filename):
         line = line.strip()
         if line:
             sentence.add_token(line)
         elif len(sentence) != 1:
             yield sentence
             sentence = Sentence()
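A hedged usage sketch for the generator above: it expects one token per line with blank lines between sentences, yields a Sentence at each blank line, and (as written) never yields the final sentence if the file does not end with a blank line. The reader instance and file name are assumptions:

for sent in reader.read_sentence("train.txt"):  # `reader` is an instance of the class above
    print(len(sent), "tokens")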
Example #10
 def test_ends_with_dot_period(self):
     has_period = Sentence(
         [FirstToken('Short'),
          WordToken('sentence'),
          PeriodToken()])
     self.assertTrue(has_period.ends_with_dot())
     no_period = Sentence([FirstToken('Word')])
     self.assertFalse(no_period.ends_with_dot())
     super_has_period = Sentence([FirstToken('Run'), has_period])
     self.assertTrue(super_has_period.ends_with_dot())
     super_no_period = Sentence([FirstToken('Run'), no_period])
     self.assertFalse(super_no_period.ends_with_dot())
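The test pins down the recursive contract of ends_with_dot(): it is true when the last element is a PeriodToken, and also when the last element is a nested Sentence that itself ends with a dot. A minimal sketch consistent with the test, not the project's actual implementation:

    def ends_with_dot(self):
        # Sketch only: true if the last element is a PeriodToken, or a nested
        # Sentence that ends with a dot; false for an empty Sentence.
        if len(self) == 0:
            return False
        last = self[-1]
        if isinstance(last, PeriodToken):
            return True
        return isinstance(last, Sentence) and last.ends_with_dot()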
Example #11
def get_docs_list(conn, courseid, questionid):
    """
    Parameters:
        conn: A mysql connection.
        courseid: String
        questionid: Integer
    Return:
        Two lists: the textids and the docs (class Sentence); the first entries are -1 and the reference Sentence, respectively.
    """
    # Get reference of certain courseid and questionid.
    try:
        get_ref_sql = "SELECT ref FROM standards WHERE courseid=%s AND questionid=%s"
        get_ref_cur = conn.cursor()
        get_ref_cur.execute(get_ref_sql, (courseid, questionid))
        ref = get_ref_cur.fetchone()[0]

    except Exception as e:
        print("Error getting reference of courseid", courseid, "questionid",
              questionid)
        traceback.print_exc()
    finally:
        get_ref_cur.close()

    reference = Sentence(text=ref, language="ch")
    reference.preprocess()
    doc_matrix = [reference]  # add Sentence reference as the first term of doc_matrix
    textids = [-1]  # Use -1 as the reference textid

    # Get all detection text of certain courseid and questionid.
    detections = None
    try:
        get_detection_sql = "SELECT textid, text FROM detection WHERE courseid = %s and questionid = %s"
        get_detection_cur = conn.cursor()
        if get_detection_cur.execute(get_detection_sql,
                                     (courseid, questionid)):
            detections = get_detection_cur.fetchall()
        else:
            detections = None
            print("No quesion", questionid, "of", courseid, "in DETECTION DB.")
    except Exception as e:
        print("Error getting text...", traceback.print_exc())
    finally:
        get_detection_cur.close()

    # Add all detections into doc_matrix
    if detections is None:
        return
    for dt in detections:
        textids.append(dt[0])
        cur_ans = Sentence(text=dt[1], language="ch")
        cur_ans.preprocess()
        doc_matrix.append(cur_ans)
    return textids, doc_matrix
Example #12
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model, ner_dict,
                     syn_dict):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)
    #----------------------------------------------------------------------------------------------------------

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)
    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    mt_features = calc_mt_features(sent_l, sent_r)
    feature_dict.update(mt_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, signature="notion")
    feature_dict.update(notion_count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    sentvec_features = get_sentvec_features(sent_word2vec,
                                            sent_vocab_dict,
                                            sent_model,
                                            l_notion,
                                            r_notion,
                                            signature="notion")
    feature_dict.update(sentvec_features)

    ner_features = get_ner_features(sent_l, sent_r, ner_dict, syn_dict)
    feature_dict.update(ner_features)

    return feature_dict
Example #13
def into_sentence(prefix: list[int], conn_dict: dict[int, tp.Iterable[str]],
                  var_amount: int, var_type: str, sess) -> Sentence:
    s = Sentence([], sess)
    variables = []
    for _ in range(var_amount):
        t = s.generate(var_type)
        s.append(t)
        variables.append(t)

    s = Sentence([], sess)
    _into_sentence(s, prefix, conn_dict, variables)
    return s
Example #14
    def parse(self):

        current_drill_section = None
        previous_line = None

        for current in self._labels(self._labelfile_path):

            logging.info(current)

            if not current_drill_section or \
               current_drill_section.get_name() != current.section:

                current_drill_section = DrillSection(current.section)

                self._drill_sections.append(current_drill_section)

            if not previous_line:
                previous_line = current
                continue

            assert previous_line.section == current.section

            if previous_line.drill != current.drill:

                example_sentence = Sentence(previous_line.text,
                                            self._audiofile_path,
                                            previous_line.start_s,
                                            previous_line.end_s)

                current_drill_section.set_example(example_sentence)

                previous_line = current

                continue

            assert previous_line.section == current.section
            assert previous_line.drill == current.drill

            teacher_sentence = Sentence(previous_line.text,
                                        self._audiofile_path,
                                        previous_line.start_s,
                                        previous_line.end_s)

            previous_line = None

            student_sentence = Sentence(current.text, self._audiofile_path,
                                        current.start_s, current.end_s)

            new_drill = Drill(teacher_sentence, student_sentence)

            current_drill_section.add_drill(new_drill)
Example #15
 def test_equal_operator(self):
     left = Sentence([
         FirstToken('Height'),
         WordToken('of'),
         WordToken('box'),
         PeriodToken()
     ])
     right = Sentence([
         FirstToken('Height'),
         WordToken('of'),
         WordToken('box'),
         PeriodToken()
     ])
     self.assertTrue(left == right)
Example #16
    def setUp(self):
        sentence_1 = Sentence('今すぐ泳ぎなさい', '')
        self.sc_1 = SentenceClassifier(sentence_1)

        sentence_2 = Sentence('魚雷を発射するとよい', '')
        self.sc_2 = SentenceClassifier(sentence_2)

        sentence_3 = Sentence('島風を育ててはいかがでしょう', '')
        self.sc_3 = SentenceClassifier(sentence_3)

        sentence_4 = Sentence('はい、大丈夫です!', '')
        self.sc_4 = SentenceClassifier(sentence_4)

        sentence_5 = Sentence('浴室は使用前に一度、熱湯消毒をしましょう!', '')
        self.sc_5 = SentenceClassifier(sentence_5)
Example #17
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    """
    tagger = RnnTagger()
    tagger.tag(text.lstrip().rstrip())
    
    tokenized = tagger.tokens
    pos_tagged = tagger.pos_tag

    lemmatized = tagger.lemmas
    """

    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]
    lemmatized = [
        lemmatizer.lemmatize(tokenized[i]) if not pos_tagged[i].startswith("V")
        else lemmatizer.lemmatize(tokenized[i], pos='v')
        for i in range(len(tokenized))
    ]

    conll_lines = [
        "{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
            i + 1, tokenized[i], lemmatized[i], pos_tagged[i], index)
        for i in range(len(tokenized))
    ]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)

    instance = CoNLL09Example(sentence, elements)

    return instance
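make_data_instance leans on module-level names it does not define: nltk (with its tokenizer and POS-tagger data installed), a lemmatizer, and the project's CoNLL09Element/CoNLL09Example/Sentence classes. A hedged sketch of the setup the first two most likely correspond to; the resource names are the classic NLTK ones and may differ in newer releases:

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("punkt")                       # data for nltk.tokenize.word_tokenize
nltk.download("averaged_perceptron_tagger")  # data for nltk.pos_tag
nltk.download("wordnet")                     # data for the lemmatizer

lemmatizer = WordNetLemmatizer()  # supports lemmatize(word, pos='v') as used above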
Example #18
    def summarize(self, document_path):
        sentences = {}
        counter = 0

        with open(document_path, 'r') as f:
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences[sent] = (sent.tfidf(self.tf, self.df, Summarizer.NUM_DOCS), counter)
                        sentence = []
                        counter += 1

        totalWords = 0
        selected = []
        already_included = set()
        # Use the tf-idf score to sort the sentences
        for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
            if sent not in already_included: # no duplicates
                already_included.add(sent)
                selected.append(sent)
                totalWords += sent.getLength()
                if totalWords > 100:
                    break

        # return the selected sentences in their order of appearance in the document
        return sorted(selected, key=lambda x: sentences[x][1])
Example #19
    def load_stringtext(self, textString, format_list, comment_sign):
        lines = textString.splitlines()
        column_list = {}
        for field in format_list:
            if not (field.isdigit()):
                column_list[field] = []

        length = len(format_list)

        for line in lines:
            entity = line.split()
            if len(entity) == length and entity[0] != comment_sign:
                for i in range(length):
                    if not (format_list[i].isdigit()):
                        column_list[format_list[i]].append(
                            str(entity[i].encode('utf-8')))
            else:
                if not (format_list[0].isdigit()
                        ) and column_list[format_list[0]] != []:
                    sent = Sentence(column_list, format_list, self.fgen)
                    self.data_list.append(sent)

                column_list = {}

                for field in format_list:
                    if not (field.isdigit()):
                        column_list[field] = []
Example #20
def read_spmrl_conll_file(spmrl_conll_filename):
    """
    Read a SPMRL .conll file and return list of sentences

    The input file is a SPMRL file converted to conll format by the convert_mst.py script
    Legacy code (try to use read_conll_file instead)
    """

    f = codecs.open(spmrl_conll_filename)
    lines = f.readlines()
    f.close()
    sentences = []
    tokens = []
    lemmas = []
    poses = []
    labels = []
    parents = []
    for line in lines:
        if line.strip() == '':
            s = Sentence(tokens, poses, labels, parents)
            s.set_lemmas(lemmas)
            sentences.append(s)
            tokens = []
            lemmas = []
            poses = []
            labels = []
            parents = []
        else:
            splt = line.strip().split()
            tokens.append(splt[1])
            lemmas.append(splt[2])
            poses.append(splt[4])  # use pos and not cpos
            labels.append(splt[7])
            parents.append(int(splt[6]))
    return sentences
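read_spmrl_conll_file splits the file on blank lines and reads, per whitespace-separated token row, column 1 as the word, 2 as the lemma, 4 as the POS, 6 as the head index and 7 as the label (0-based). A small made-up input in that layout and a call; the file name and tokens are invented:

sample = (
    "1 Dogs dog _ NNS _ 2 subj _ _\n"
    "2 bark bark _ VBP _ 0 root _ _\n"
    "\n"                               # blank line closes the sentence
)
with open("sample.conll", "w", encoding="utf-8") as f:
    f.write(sample)

sentences = read_spmrl_conll_file("sample.conll")
print(len(sentences))  # 1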
Example #21
    def parse_signature(self):
        """Take a stream of tokens and create a Signature.

        Signatures have stricter rules than other parts of the language, but
        they are context insensitive and don't have to match definitions.
        This will use tokens from the stream, but may not empty the stream.

        :return: A Sentence representing the sentence."""
        token = next(self._token_stream)
        if not isinstance(token, FirstToken):
            raise ParseError('Invalid start of Signature: ' + str(token))
        node = Sentence([token])
        for token in self._token_stream:
            if isinstance(token, FirstToken):
                self._token_stream.push_back(token)
                node.append(self.parse_signature())
            elif isinstance(token, WordToken):
                node.append(token)
            elif isinstance(token, PeriodToken):
                node.append(token)
                return node
            elif isinstance(token, ValueToken):
                raise ParseError('Parser.parse_signature: ValueToken not '
                                 'allowed in signature.')
            else:
                raise ValueError('Unknown Token Kind: {}'.format(type(token)))
        raise ParseError('Parser.parse_signature: fell out of the loop.')
Example #22
 def sentence_tokenize(self, review_text):
     print(review_text)
     print("\n\n\n\n\n")
     return [
         Sentence(sent, review=self)
         for sent in Review.SENT_TOKENIZER.tokenize(review_text)
     ]
Example #23
    def train(self, train_path, ftype):
        """Train the NER model.

        Args:
            train_path: str - The path of training set.
            ftype: str - Indicating the feature type.
        """
        self._nerdic = NERDic(train_path)
        io = self._io
        sentences = []

        # reading the training set.
        for words, poss, labels in io.read_sentences(train_path):
            sentences.append(Sentence(labels, words, poss, self._nerdic))

        feats, labels = self._prepare_feats(sentences, ftype)

        sep_labels = []
        for i in sentence.REVERSE_LABELS.keys():
            sep_labels.append([])
            for label in labels:
                if label == i:
                    sep_labels[i].append(1)
                else:
                    sep_labels[i].append(0)

        print('Start first phase training...')
        for i, learner in enumerate(self._learners):
            learner.train(feats, sep_labels[i])
Example #24
def form_sentence(lines,
                  word_alpha,
                  char_alpha,
                  tag_alpha,
                  symbolic_root=False,
                  symbolic_end=False):
    words = []
    word_ids = []
    seq_chars = []
    seq_char_ids = []
    tags = []
    tag_ids = []
    edu_ids = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(word_alpha.get_index(ROOT))
        seq_chars.append([
            ROOT_CHAR,
        ])
        seq_char_ids.append([
            char_alpha.get_index(ROOT_CHAR),
        ])
        tags.append(ROOT_POS)
        tag_ids.append(tag_alpha.get_index(ROOT_POS))

    for line in lines:
        chars = []
        char_ids = []
        data = line.strip().split('\t')
        word = DIGIT_RE.sub(b"0", data[2])
        word_id = word_alpha.get_index(word)
        for c in word:
            chars.append(c)
            char_ids.append(char_alpha.get_index(c))
        tag = '$' if data[4] == '#' else data[4]
        tag_id = tag_alpha.get_index(tag)
        edu_id = int(data[9])

        words.append(word)
        word_ids.append(word_id)
        seq_chars.append(chars)
        seq_char_ids.append(char_ids)
        tags.append(tag)
        tag_ids.append(tag_id)
        edu_ids.append(edu_id)

    if symbolic_end:
        words.append(END)
        word_ids.append(word_alpha.get_index(END))
        seq_chars.append([
            END_CHAR,
        ])
        seq_char_ids.append([
            char_alpha.get_index(END_CHAR),
        ])
        tags.append(END_POS)
        tag_ids.append(tag_alpha.get_index(END_POS))
    return Sentence(words, seq_chars, tags, word_ids, seq_char_ids, tag_ids,
                    edu_ids)
Example #25
def make_data_instance(text, index, get_offsets=False):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())

    # offsets to rebuild things
    offsets = []
    offset = 0
    for token in tokenized:
        offset = text.find(token, offset)
        offsets.append([offset, offset+len(token)])
        offset += len(token)

    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]

    lemmatized = [lemmatizer.lemmatize(tokenized[i]) 
                    if not pos_tagged[i].startswith("V") else lemmatizer.lemmatize(tokenized[i], pos='v') 
                    for i in range(len(tokenized))]

    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i+1, tokenized[i], lemmatized[i], pos_tagged[i], index) for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)
    if hasattr(sentence, 'tokens'):
        instance = CoNLL09Example(sentence, elements)
    else:
        instance = None
    
    if get_offsets:
        return instance, offsets
    else:
        return instance
Example #26
    def load_sentences(self, mode):
        filename = self._get_sentence_filename(mode)
        self.sentences = []  # clear sentences since we will reload it

        with open(os.path.join(self.dev_path, filename)) as f:
            data = json.load(f)

        for row in data["sentences"]:
            sentence = Sentence(
                row["start_time"],
                row["end_time"],
                self.input_locale,
                row["original_text"],
            )
            if mode == SentenceIoMode.TRANSLATE:
                for translation in row["translated_sentences"]:
                    if translation["lang_code"] not in (self.target_languages +
                                                        [self.input_language]):
                        continue

                    sentence.translated_sentences[
                        translation["lang_code"]] = TranslatedSentence(
                            translation["lang_code"], translation["text"])
            self.sentences.append(sentence)

        # validate sentence start_times
        prev_sentence_start = 0
        for idx, sentence in enumerate(self.sentences):
            if sentence.start_time <= prev_sentence_start:
                raise ValueError(
                    f"Sentence {idx} has an invalid start time (it starts too soon)"
                )
            prev_sentence_start = sentence.start_time
Example #27
    def predict(self, test_path, output_path, ftype):
        """Predict the test set.

        Args:
            test_path: str - The path of test set.
            output_path: str - The path of output file.
            ftype: str - Indicating the feature type.

        Return:
            list(Sentence) - The sentence with predicted labels.
        """
        # reading the training set.
        io = self._io
        sentences = []
        for words, poss, labels in io.read_sentences(test_path):
            sentences.append(Sentence(labels, words, poss, self._nerdic))

        for sent in sentences:
            feats, labels = self._prepare_feats([sent], ftype)
            confidence = []
            predict_ids = []
            for learner in self._learners:
                confidence.append(learner.confidence(feats))

            confidence = np.array(confidence)
            confidence = confidence.transpose()

            for con in confidence:
                predict_ids.append(np.argmax(con))

            # predict_ids = self._second_learner.predict(confidence)
            sent.add_predict(predict_ids)

        io.write_sentences(output_path, sentences)
Example #28
def main(argv):
    """Compute the sentence frequency of each term"""

    # How many sentences does each word appear in?
    lexicon = defaultdict(set)

    for arg in argv:
        with open(arg, 'r') as fin:
            sentences = list()
            for line in fin:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation,
                                        Summarizer.stopwords,
                                        Summarizer.stemmer)
                        sentences.append(sent)
                        sentence = []

            for sent in sentences:
                for w in sent.stemmed:
                    lexicon[w].add(sent)  # set() will de-duplicate

        sf = {}
        for w in lexicon:
            sf[w] = len(lexicon[w])

        #print sf
        with open('sf.dat', 'wb') as out:
            pickle.dump(sf, out)
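The script pickles the sentence-frequency table to sf.dat; a short sketch of reading it back, using only the file name and format the snippet itself chose (the 'example' key is made up):

import pickle

with open('sf.dat', 'rb') as fin:
    sf = pickle.load(fin)  # dict: stemmed term -> number of sentences it appears in
print(sf.get('example', 0))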
Example #29
    def __call__(self, *coms: tuple[Union[list, Sentence, int,
                                          Callable]]) -> None:
        """ Używane do manipulacji historią

            Możliwe argumenty:
                - `Sentence`    - dodaje formułę do historii 
                - `Callable`    - wykonuje operacje `callable(history)` na obiekcie historii, a wynik nadpisuje jako nową historię; traktuj ją jako `set`
                - `int`         - wykonuje jedną z predefiniowanych operacji:
                    -  0 - operacja pusta
                    - -1 - czyszczenie historii

            :raises TypeError: Typ nie jest obsługiwany 
        """
        for num, command in enumerate(coms):
            if isinstance(command, Sentence):
                self.add_sentence(command)
            elif isinstance(command, list):
                self.add_sentence(Sentence(command))
            elif callable(command):
                self = History(command(self))
            elif isinstance(command, int):
                if command == -1:  # Clear set
                    self.clear()
                elif command == 0:  # Pass
                    pass
            else:
                raise TypeError(
                    f"History does not accept type {type(command).__name__} (command {num+1})"
                )
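The docstring above defines a small command protocol for History.__call__; a hypothetical usage sketch, assuming history is an existing History instance (the token lists are made up, and several commands may be passed in one call):

history(['p', 'or', 'q'])                        # a bare list is wrapped in a Sentence
history(0)                                       # no-op
history(-1)                                      # clear the history
history(lambda h: {s for s in h if len(s) > 1})  # rebuild the history from a callable
history(['p'], ['q'], -1)                        # commands can be chained in one call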
Example #30
    def parse_operator(self, scope):
        """Parse an operator from the stream and create an operator sentence.

        This one does not enforce the shape of operator sentences. It is just
        a bit too complex to do cleanly, so it happens on the define.

        :return: A Sentence, may be an operator sentence or might just be
            an expression."""
        token = next(self._token_stream)
        node = Sentence()
        part_match = scope.new_matcher()
        for token in self._token_stream:
            if isinstance(token, OperToken):
                if part_match.next(token):
                    node.append(token)
                elif part_match.end():
                    self._token_stream.push_back(token)
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
            elif isinstance(token, FirstToken):
                self._token_stream.push_back(token)
                if part_match.next():
                    node.append(self.parse_expression(scope))
                elif part_match.end():
                    return node if 1 < len(node) else node[0]
                else:
                    raise ParseError('Sentence not matched.', node)
            else:
                self._token_stream.push_back(token)
                if part_match.end():
                    return node
                else:
                    raise ParseError('Sentence not matched.', node)
        raise ParseError('Operator not closed')