Example #1
def processLine(line, vob, out):
    global totalLine
    global longLine
    global totalChars
    ss = line.split("\t")

    sentence = Sentence()
    nn = len(ss)
    for i in range(nn):
        ts = ss[i].split(" ")
        ustr = unicode(ts[0].decode('utf8'))
        sentence.addToken(ustr)
    if sentence.chars > MAX_LEN:
        longLine += 1
    else:
        x = []
        y = []
        totalChars += sentence.chars
        sentence.generate_tr_line(x, y, vob)
        nn = len(x)
        assert (nn == len(y))
        for j in range(nn, MAX_LEN):
            x.append(0)
            y.append(0)
        line = ''
        for i in range(MAX_LEN):
            if i > 0:
                line += " "
            line += str(x[i])
        for j in range(MAX_LEN):
            line += " " + str(y[j])
        out.write("%s\n" % (line))
    totalLine += 1
Example #2
    def generate(self, sentence: Sentence, type_: str) -> token_type:
        """
        Generates a new token for the given type

        :param sentence: The sentence in which the token will be used
        :type sentence: Sentence
        :param type_: Token type
        :type type_: str
        :return: Token in the form `[type]_[lexeme]`
        :rtype: str
        """
        assert type_ in self.generator_regexes, "Type doesn't exist in this Lexicon"
        if type_ in self.find_new:
            new_lexems = generate(self.generator_regexes[type_])
            used_lexems = sentence.getLexems()

            try:
                while (new_lex := next(new_lexems)) in used_lexems:
                    pass
            except StopIteration:
                raise LexError(f"Need more lexems for the {type_} type")
            else:
                return f"{type_}_{new_lex}"

        else:
            counted = Counter(
                (l for t, l in sentence.getItems() if t == type_)).items()
            try:
                new_lex = max(counted, key=lambda x: x[1])[0]
            except ValueError:
                return f"{type_}_{getone(self.generator_regexes[type_])}"
            else:
                return f"{type_}_{new_lex}"
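The while/walrus loop above keeps drawing lexemes from the generator until it finds one the sentence does not already use. A minimal, self-contained sketch of that pattern (first_unused and its arguments are illustrative names, not part of the project above):

def first_unused(candidates, used):
    # Pull items until one is not in `used`; mirrors the walrus loop in generate().
    it = iter(candidates)
    try:
        while (item := next(it)) in used:
            pass
    except StopIteration:
        raise LookupError("ran out of candidates") from None
    return item

print(first_unused("pqrs", {"p", "q"}))  # -> r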
Example #3
def extract_features(wordseg, ql, qr):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    feature_dict = {}

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    return feature_dict
Example #4
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql) 
    sent_l.basic_words = wordseg_out.basic_words
    
    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr) 
    sent_r.basic_words = wordseg_out.basic_words
    
    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)
    
    count_tfidf_hash_features = get_tfidf_count_hash_features(sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(l_notion, r_notion, tfidf_count_hash_vectorModels, "notion")
    feature_dict.update(notion_count_tfidf_hash_features)
    
    for k in feature_dict:
        print(k)
    
    return feature_dict
Example #5
    def load_sentences(self, mode):
        filename = self._get_sentence_filename(mode)
        self.sentences = []  # clear sentences since we will reload it

        with open(os.path.join(self.dev_path, filename)) as f:
            data = json.load(f)

        for row in data["sentences"]:
            sentence = Sentence(
                row["start_time"],
                row["end_time"],
                self.input_locale,
                row["original_text"],
            )
            if mode == SentenceIoMode.TRANSLATE:
                for translation in row["translated_sentences"]:
                    if translation["lang_code"] not in (self.target_languages +
                                                        [self.input_language]):
                        continue

                    sentence.translated_sentences[
                        translation["lang_code"]] = TranslatedSentence(
                            translation["lang_code"], translation["text"])
            self.sentences.append(sentence)

        # validate sentence start_times
        prev_sentence_start = 0
        for idx, sentence in enumerate(self.sentences):
            if sentence.start_time <= prev_sentence_start:
                raise ValueError(
                    f"Sentence {idx} has an invalid start time (it starts too soon)"
                )
            prev_sentence_start = sentence.start_time
Example #6
def read_spmrl_conll_file(spmrl_conll_filename):
    """
    Read a SPMRL .conll file and return a list of sentences

    The input file is a SPMRL file converted to conll format by the convert_mst.py script
    Legacy code (try to use read_conll_file instead)
    """

    f = codecs.open(spmrl_conll_filename)
    lines = f.readlines()
    f.close()
    sentences = []
    tokens = []
    lemmas = []
    poses = []
    labels = []
    parents = []
    for line in lines:
        if line.strip() == '':
            s = Sentence(tokens, poses, labels, parents)
            s.set_lemmas(lemmas)
            sentences.append(s)
            tokens = []
            lemmas = []
            poses = []
            labels = []
            parents = []
        else:
            splt = line.strip().split()
            tokens.append(splt[1])
            lemmas.append(splt[2])
            poses.append(splt[4])  # use pos and not cpos
            labels.append(splt[7])
            parents.append(int(splt[6]))
    return sentences
Example #7
def get_original_text(conn, courseid, questionid):
    """
    Parameters:
        conn: A mysql connection
        courseid: String, '201英语一' or '202英语二'
        questionid: Integer

    Return:
        The original text of the translation question for '201英语一', as a preprocessed Sentence instance.
        None if courseid is '202英语二'.

    """
    original_text = ""
    if courseid == '202英语二':
        return None
    try:
        get_original_sql = "SELECT original From standards WHERE courseid=%s AND questionid=%s"
        get_original_cur = conn.cursor()
        get_original_cur.execute(get_original_sql, (courseid, questionid))
        original_text = get_original_cur.fetchone()[0]
    except Exception as e:
        print("Error getting original text of courseid", courseid, "questionid", questionid)
        traceback.print_exc()
    finally:
        get_original_cur.close()
    # print(original_text)
    original = Sentence(text=original_text, language='en')
    original.preprocess()
    return original
Example #8
def convert_spacy_format(text):
    parsed_text = []
    # instantiate Spacy's parser
    parser = English()
    # parse text via Spacy's parser
    doc = parser(unicode(text, "utf-8"))
    for sent in doc.sents:
        s = Sentence()
        s.string = str(sent)
        word_index = 0
        for token in sent:
            # Problem: sometimes spaCy misreads a newline in a text file and yields an empty token.
            # The following condition filters these out.
            if len(token.orth_) > 1:
                # Spacy's tags for each word in the sentence are stored in a new Word-object
                w = Word()
                w.string = token.orth_
                w.lemma = token.lemma_
                w.index = word_index
                # less verbose tags are provided by "token.pos_"
                w.tag = token.tag_
                w.entity = token.ent_type_
                word_index += 1
                # each word is appended to a Sentence-object
                s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
Example #9
def pop_part(sentence: Sentence, split_type: str, sent_num: int):
    """
    Returns the n-th subsentence (split on split_type objects), removing it from the sentence
    """
    split_count = 0
    start_split = 0
    for s in sentence:
        if s.startswith(f"{split_type}_"):
            split_count += 1
        if split_count == sent_num:
            break
        start_split += 1

    if len(sentence) <= start_split or split_count < sent_num:
        raise IndexError("sent_num is too big")

    part = []
    if split_count > 0:
        sentence.pop(start_split)

    while start_split < len(sentence) and not sentence[start_split].startswith(
            f"{split_type}_"):
        part.append(sentence.pop(start_split))

    if len(sentence) > 0 and split_count == 0:
        sentence.pop(start_split)
    return part
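A hypothetical walk-through of pop_part, assuming Sentence behaves like a mutable list of "type_lexeme" strings (which the startswith/pop calls above rely on); the token values below are made up:

sent = ["signvar_p", "or_or", "signvar_q", "or_or", "signvar_r"]
part = pop_part(sent, "or", 0)   # pops the 0th chunk and its trailing split token
print(part)  # ['signvar_p']
print(sent)  # ['signvar_q', 'or_or', 'signvar_r']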
Example #10
    def summarize(self, document_path):
        sentences = {}
        counter = 0

        with open(document_path, 'r') as f:
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences[sent] = (sent.tfidf(self.tf, self.df, Summarizer.NUM_DOCS), counter)
                        sentence = []
                        counter += 1

        totalWords = 0
        selected = []
        already_included = set()
        # Use the tf-idf score to sort the sentences
        for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
            if sent not in already_included: # no duplicates
                already_included.add(sent)
                selected.append(sent)
                totalWords += sent.getLength()
                if totalWords > 100:
                    break

        # return the selected sentences in their order of appearance in the document
        return sorted(selected, key=lambda x: sentences[x][1])
Example #11
    def start(self, tag, attrib):
        self.tag = tag
        if tag == 'sentences':
            self.parse_sent = True
        elif tag == 'sentence':
            if self.parse_sent:
                self.sent = Sentence(int(attrib['id']) - 1)
        elif tag == 'dependencies':
            if attrib['type'] == consts.corenlp_dependency_type \
                    and self.parse_sent:
                self.parse_dep = True
                self.copied_dep = False
        elif tag == 'dep':
            if self.parse_dep:
                self.dep_label = attrib['type']
                if 'extra' in attrib:
                    self.extra = True
        elif tag == 'governor':
            if self.parse_dep:
                self.gov_idx = int(attrib['idx']) - 1
                if 'copy' in attrib:
                    self.copied_dep = True
        elif tag == 'dependent':
            if self.parse_dep:
                self.dep_idx = int(attrib['idx']) - 1
                if 'copy' in attrib:
                    self.copied_dep = True
        elif tag == 'coreference':
            if not self.parse_coref:
                self.parse_coref = True
            self.coref = Coreference(len(self.corefs))
        elif tag == 'mention':
            if self.parse_coref:
                if 'representative' in attrib:
                    self.rep = True
Example #13
def extract_features(wordseg, ql, qr, sent_word2vec, word_weights, sent_model):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    feature_dict = {}

    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    sentvec_features = get_sentvec_features(sent_word2vec, word_weights,
                                            sent_model, sent_l, sent_r,
                                            "notion")
    feature_dict.update(sentvec_features)

    for k, value in feature_dict.items():
        print(k)
        print(value)

    return feature_dict
Example #14
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model):

    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)

    return feature_dict
Example #15
def search(sentence):
    s = Sentence(sentence)
    word_list = s.segment().filter().word_list
    found_list = findSimilarWords.by_word_list(word_list, 20)
    if len(found_list) != 0:
        return json.dumps(found_list)
    else:
        return json.dumps([])
Example #16
def _consume_message(ch, method, properties, body):
    try:
        sentence = Sentence()
        sentence.from_json(body)
        _persist_sentence(sentence)
        mq_handler.ack(method)
    except Exception as e:
        logging.error("Could not consume message: " + body + ".\nException:" + str(e))
Example #17
def _consume_message(ch, method, properties, body):
    try:
        original_sentence = Sentence()
        original_sentence.from_json(body)
        _translate_to_all_languages(original_sentence)
        mq_handler.ack(method)
    except Exception as e:
        logging.error(
            "Could not consume message: " + body + ".\nException:" + str(e))
Example #18
    def test_one_word(self):
        word_expected = "sentence"
        text = "sentence."

        sentence = Sentence(text)

        (word_actual, length) = sentence.get_longest_word()
        self.assertEqual(word_actual, word_expected)
        self.assertEqual(len(word_actual), len(word_expected))
Example #19
    def test_ignores_extra_whitespace(self):
        longest_word_expected = "antidisestablismentarianism"
        text = "hello, 		my   name  is      antidisestablismentarianism."

        sentence = Sentence(text)

        (longest_word_actual, length) = sentence.get_longest_word()
        self.assertEqual(longest_word_actual, longest_word_expected)
        self.assertEqual(len(longest_word_actual), len(longest_word_expected))
Example #20
def test_select():
    expected_selected = list('以     ')
    expected_source = list('可 我想我。')
    sentence = Sentence('我想我可以。', 'I think I can.', state='可以我想我。')

    sentence.select(1)

    assert expected_selected == sentence.selected
    assert expected_source == sentence.source
Example #21
def init_dataset(src_path, tgt_path, src_out, tgt_out):
    with open(src_path) as fp_in_src, open(tgt_path) as fp_in_tgt:
        with open(src_out, 'w') as fp_out_src, open(tgt_out,
                                                    'w') as fp_out_tgt:
            for src_l, tgt_l in tqdm(zip(fp_in_src, fp_in_tgt)):
                src, tgt = Sentence.deserialize(src_l), Sentence.deserialize(
                    tgt_l)
                fp_out_src.write("{}|{}\n".format(src.source, tgt.key))
                fp_out_tgt.write("{}\n".format(tgt.source))
Example #22
    def __init__(self, input_file, heuristic):
        if input_file is not None:
            clause_list = self.read_input(input_file)
        variable_set = set(map(abs,
                               itertools.chain.from_iterable(clause_list)))
        self.model = Model(variable_set)
        self.sentence = Sentence(clause_list, self.model)
        self.root_stage = 0
        self.heuristic = heuristic
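For reference, the variable_set line above collects every variable that occurs in any clause by flattening the clause list and dropping literal signs. A small stand-alone sketch with a made-up clause_list:

import itertools

clause_list = [(1, -3), (2, 3, -1)]
variable_set = set(map(abs, itertools.chain.from_iterable(clause_list)))
print(variable_set)  # {1, 2, 3}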
Example #23
    def __iter__(self):
        with open(self.fr_file) as fp_fr, open(self.en_file) as fp_en:
            for i, (line_fr, line_en) in enumerate(zip(fp_fr, fp_en)):
                if self.max_line and i > self.max_line:
                    break

                fr = Sentence(line_fr.strip(), 'fr', i, self.name)
                en = Sentence(line_en.strip(), 'en', i, self.name)

                yield fr, en
Example #24
    def __init__(self, n):
        Sentence.__init__(self, n)
        self.knowledge = {}

        for line in open(ngrams_dir + str(n) + '.gram').read().splitlines():
            data = line.split(', ')
            word = data[0]
            self.knowledge[word] = []
            for i in range(1, len(data)):
                self.knowledge[word].append(data[i])
Example #25
    def declare(kb, input):
        while input:
            sent_str, input = Sentence.next(input)
            sent_type = Sentence.classify(sent_str)
            if sent_type == 'fact':
                fact = Facts.parse_fact(sent_str)
                kb.add_fact(fact)
            elif sent_type == 'rule':
                rule = Rules.parse_rule(sent_str)
                kb.add_rule(rule)
Example #26
class TestSemantics(TestCase):
    def setUp(self):
        self.sentence = Sentence()

    def test_generate(self):
        #self.sentence.generate(['東京', 'とは'])
        pass

    def test_get_posid(self):
        self.sentence._get_posid(['東京', 'とは'])
Example #27
def main(n):
    db = DB(db_params)
    result = db.query(
        'select activityDescription,noga1,noga2,noga3,noga4,noga5 from CONTACTS where noga1!="" AND '
        'activityDescription != "N/A" limit 10000')

    knowledge = {}
    for item in result:

        description = item[0].lower()
        nogas = [item[1], item[2], item[3], item[4], item[5]]
        nogas = filter(None, nogas)
        noga = str(nogas[0])

        sentence_proc = Sentence(n)
        sentence_proc.set_desc(description)
        sentence_proc.tokenize()
        sentence_proc.generate_ngrams()

        all_ngrams = sentence_proc.get_ngrams()

        for ngram in all_ngrams:
            if ngram not in knowledge:
                knowledge[ngram] = []
            knowledge[ngram].append(noga)

    for key, value in knowledge.iteritems():
        value = most_common(value, 5)
        value = ', '.join(value)
        print '{0}, {1}'.format(key, value)
Example #28
    def calculate_sentences(self):
        self.topic_scores_dic = {}
        self.subsentence_detail_dic = {}
        self.score_detail_dic = {}
        self.word_count_dic = {}
        for index, sentence in self.sentences_dic.items():
            s = Sentence(sentence, parser, "all_dic.csv", self.devide_topic_n,
                         self.divide_word_n)
            self.topic_scores_dic[index] = s.topic_score
            self.score_detail_dic[index] = s.model_details()
            self.subsentence_detail_dic[index] = s.subsentence_detail
Example #29
def read_sentences(file, speakers):
    """Reads sentence data from file and parses it with the speaker data."""
    bulk_data = read_text(file)

    sentence_indecies = []
    sentences = []

    # Use these for index bounds of each block of sentence data.
    x = -1
    y = -1

    # Find the beginning and end indices for each sentence block.
    for idx in range(0, len(bulk_data)):
        if len(bulk_data[idx]) > 0:
            if bulk_data[idx][0] == '1':
                x = idx
        else:
            y = idx

        if x > -1 and y > -1:
            sentence_indecies.append((x, y))
            x = -1
            y = -1

    # Use the bounds of each sentence block to extract each sentence.
    # Making the assumption that while each word has a speaker ID, all sentence
    # blocks are uttered by the same person.  This may not be true, but we
    # don't really care that much about the speaker data.
    for block in sentence_indecies:
        data_block = bulk_data[block[0]:block[1]]

        # Let's extract the speaker information.
        speaker_ID = data_block[0][-1]
        this_speaker = None
        for speaker in speakers:
            if speaker_ID == speaker.sid:
                this_speaker = speaker
                break

        this_word_list = []
        this_sentence_list = []
        for data in data_block:
            word_data = data.split('\t')
            this_word_list.append((word_data[1], word_data[3]))
            this_sentence_list.append(word_data[1])

        sentence_text = " ".join(this_sentence_list)

        this_sentence = Sentence(this_speaker, sentence_text, this_word_list)
        this_sentence.lem()

        sentences.append(this_sentence)

    return sentences
Example #30
def index():
    guess = u''
    bag_of_words = u''
    query = u''
    if request.method == 'POST':
        query = request.form.get('q', '')
        if query != '':
            sentence = Sentence(query, 1)
            bag_of_words = sentence.bag_of_words()
            guess = classifier.classify(bag_of_words)
            guess = u'Positive' if guess == 'pos' else u'Negative'
    return render_template('index.html', guess=guess, query=query, bag_of_words=bag_of_words)
Example #31
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model, ner_dict,
                     syn_dict):
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    feature_dict = {}
    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)
    #----------------------------------------------------------------------------------------------------------

    lexical_features = calc_lexical_features(sent_l, sent_r)
    feature_dict.update(lexical_features)
    periph_lexical_features = calc_periph_lexical_features(l_periph, r_periph)
    feature_dict.update(periph_lexical_features)

    mt_features = calc_mt_features(sent_l, sent_r)
    feature_dict.update(mt_features)

    count_tfidf_hash_features = get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels)
    feature_dict.update(count_tfidf_hash_features)
    notion_count_tfidf_hash_features = get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, signature="notion")
    feature_dict.update(notion_count_tfidf_hash_features)

    sentvec_features = get_sentvec_features(sent_word2vec, sent_vocab_dict,
                                            sent_model, sent_l, sent_r)
    feature_dict.update(sentvec_features)
    sentvec_features = get_sentvec_features(sent_word2vec,
                                            sent_vocab_dict,
                                            sent_model,
                                            l_notion,
                                            r_notion,
                                            signature="notion")
    feature_dict.update(sentvec_features)

    ner_features = get_ner_features(sent_l, sent_r, ner_dict, syn_dict)
    feature_dict.update(ner_features)

    return feature_dict
Example #32
    def predict(self, test_file):
        """
        Predicts the language of every line in the given file, based on the decision-tree hypothesis.

        :param test_file: path of file containing untagged sentences.
        :return:
        """
        test_file = open(test_file, 'r')
        for line in test_file:
            line_data = Sentence(line, False)
            line_data.tag = self.majority_result(line_data)
            print(line_data.tag)
Example #33
    def declare(kb, list_data):  # initialize a knowledge database
        while list_data:
            current_line, list_data = Sentence.read_list(
                list_data)  # return a comment/fact/rule and update list
            type = Sentence.classify(current_line)
            if type == 'fact':
                fact = Fact.parse_fact(current_line)
                kb.add_fact(fact)
                kb.add_fact_predicate(fact.predicate)
            elif type == 'rule':
                rule = Rule.parse_rule(current_line)
                kb.add_rule(rule)
                kb.add_rule_conclusion_predicate(rule.conclusion.predicate)
Example #34
    def parse(self):

        current_drill_section = None
        previous_line = None

        for current in self._labels(self._labelfile_path):

            logging.info(current)

            if not current_drill_section or \
               current_drill_section.get_name() != current.section:

                current_drill_section = DrillSection(current.section)

                self._drill_sections.append(current_drill_section)

            if not previous_line:
                previous_line = current
                continue

            assert previous_line.section == current.section

            if previous_line.drill != current.drill:

                example_sentence = Sentence(previous_line.text,
                                            self._audiofile_path,
                                            previous_line.start_s,
                                            previous_line.end_s)

                current_drill_section.set_example(example_sentence)

                previous_line = current

                continue

            assert previous_line.section == current.section
            assert previous_line.drill == current.drill

            teacher_sentence = Sentence(previous_line.text,
                                        self._audiofile_path,
                                        previous_line.start_s,
                                        previous_line.end_s)

            previous_line = None

            student_sentence = Sentence(current.text, self._audiofile_path,
                                        current.start_s, current.end_s)

            new_drill = Drill(teacher_sentence, student_sentence)

            current_drill_section.add_drill(new_drill)
Example #35
    def test_equal_operator(self):
        left = Sentence([
            FirstToken('Height'),
            WordToken('of'),
            WordToken('box'),
            PeriodToken()
        ])
        right = Sentence([
            FirstToken('Height'),
            WordToken('of'),
            WordToken('box'),
            PeriodToken()
        ])
        self.assertTrue(left == right)
Example #36
def compute_polarity_scores():
    for i, sentence_key in enumerate(sorted(filtered_sentences)):
        if i < beginTrain or i > endTrain:
            continue
        print
        print "==============================="
        print "sentence#: " + str(i) + " -- " + str(i - beginTrain + 1) +  "/" + str(endTrain - beginTrain + 1)
        print
        sentence_dict = filtered_sentences[sentence_key]
        subsentence_keys = sorted(filter(lambda x: isinstance(x, int), sentence_dict.keys()))

        for j, subsentence_key in enumerate(subsentence_keys):
            print
            print "-------------------------------"
            print "subsentence#: " + str(j + 1) + "/" + str(len(subsentence_keys))
            print
            sentence = Sentence(sentence_dict[subsentence_key]['sentence'], sentence_key, subsentence_key, foodName)
            concept_polarity, filtered_concept_list = polarity.compute_concept_polarity(foodName, sentence)
            adj_polarity = polarity.compute_adj_polarity(foodName, sentence)
            dep_polarity = polarity.compute_dep_polarity(foodName, sentence)

            business = filter(lambda x:x["business_id"] == inputs[sentence_key]["business_id"], businesses)[0]
            results.append({
                "rating": sentence_dict[subsentence_key]['rating'] if human else dep_polarity,
                "type": "manual_label" if human else "dep_polarity",
                "concept_polarity": concept_polarity,
                "adj_polarity": adj_polarity,
                "dep_polarity": dep_polarity,
                "id": i,
                "sentence": sentence.str_val,
                "sentence_key": sentence_key,
                "subsentence_key": subsentence_key,
                "business_id": inputs[sentence_key]["business_id"],
                "user_id": inputs[sentence_key]["user_id"],
                "votes": inputs[sentence_key]["votes"],
                "stars": inputs[sentence_key]["stars"],
                "lng": business["longitude"],
                "lat": business["latitude"],
                "full_address": business["full_address"],
                "name": business["name"],
                "food": foodName,
                "concepts": sentence.getConcepts(),
                "filtered_concepts": filtered_concept_list
            })

            if human:
                print "rating: " + str(sentence_dict[subsentence_key]['rating'])
            print "concept_polarity: " + str(concept_polarity)
            print "adj_polarity: " + str(adj_polarity)
            print "dep_polarity: " + str(dep_polarity)
Example #37
    def parse_signature(self):
        """Take a stream of tokens and create a Signature.

        Signatures have stricter rules than other parts of the language, but
        they are context insensitive and don't have to match definitions.
        This will use tokens from the stream, but may not empty the stream.

        :return: A Sentence representing the signature."""
        token = next(self._token_stream)
        if not isinstance(token, FirstToken):
            raise ParseError('Invalid start of Signature: ' + str(token))
        node = Sentence([token])
        for token in self._token_stream:
            if isinstance(token, FirstToken):
                self._token_stream.push_back(token)
                node.append(self.parse_signature())
            elif isinstance(token, WordToken):
                node.append(token)
            elif isinstance(token, PeriodToken):
                node.append(token)
                return node
            elif isinstance(token, ValueToken):
                raise ParseError('Parser.parse_signature: ValueToken not '
                                 'allowed in signature.')
            else:
                raise ValueError('Unknown Token Kind: {}'.format(type(token)))
        raise ParseError('Parser.parse_signature: fell out of the loop.')
Example #38
def segment_aspects(input_fname,out_fname):
    print "begin loading sentences, ......"
    with open(input_fname,"rt") as inf:
        dd = json.load(inf)
        sentences = [ Sentence.from_json(d) for d in dd]
    print "{} sentences loaded".format(len(sentences))

    ######### for test and debug
    # sentences = random.sample(sentences,2000)
    ######### end test and debug

    seed_aspect_keywords = {
        "Overall":set(["recommend","recommendation","love","return","best","regret","rating"]),
        "Value": set(["value", "price", "quality", "worth"]),
        "Room": set([ "suite", "view", "bed","spacious","noisy","small"]),
        "Location": set(["location", "traffic", "minute", "parking","restaurant","shop","locate","subway","bus","airport","downtown"]),
        "Cleanliness": set(["clean", "dirty", "maintain", "smell"]),
        "Service": set(["staff","check", "help","service","helpful","friendly"]),
        "Business service": set(["business", "center", "computer", "internet","wifi","free"])
    }
    segmenter = AspectSegmentation(sentences,seed_aspect_keywords)
    segmenter.run()

    # save results
    print "begin dumping the results, ......"
    save_segmentation_results(segmenter,out_fname)
    print "!!! DONE !!!"
Example #39
    def from_dict(d):
        r = Review()
        r.id = d.get("_id",None)
        r.business_id = d.get("business_id",None)  # sometimes, we just don't care business_id
        r.ratings = d.get("ratings",None)
        r.sentences = [Sentence.from_dict(sent_dict) for sent_dict in d["sentences"]]
        return r
Example #40
    def add_sentence(self, sen):
        # change tokens to ints
        if self._int_tokens:
            sen = self.tokens_to_ints(sen)

        # create actual Sentence instance
        new_sen = Sentence(sen)
        self._corpus.append(new_sen)

        # filter stopwords
        if hasattr(self, "_stopwords"):
            new_sen.remove_toks(self._stopwords, self._backup)

        # register to index
        sen_index = len(self._corpus) - 1
        for tok in new_sen:
            self._index[tok].add(sen_index)
Example #41
def _translate_to(original_sentence, to_language):
    translation = translator.translate(
        original_sentence.get_text(),
        to_language,
        original_sentence.get_language())
    translated_sentence = Sentence()
    translated_sentence.set_author(original_sentence.get_author())
    translated_sentence.set_language(to_language)
    translated_sentence.set_text(translation)
    mq_handler.publish('storage', '', translated_sentence.to_json(), True)
Example #42
def test_negation_suffix():
    stopwords = common.make_stop_words()
    sentences = [   "I don't like Beijing 123, because it's too expensive", "I cannot 4 run away 56, since I am a grown man",
                    "never ever come back again, I swear to god","without any problem","I don't think I will enjoy it: it might be too spicy" ]
    for index,raw_sent in enumerate(sentences):
        sentence = Sentence.from_raw(raw_sent,stopwords)
        print "\n=========================== [{}]".format(index+1)
        print sentence.raw
        print sentence.words
Example #43
def tokenize_merge(row):
    allwords = []
    for text in row.iloc[1:].dropna():
        text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''")
        s = Sentence.from_raw(text,StopWords,neg_mark=True)
        allwords += s.words

    print allwords# show progress
    return allwords
Example #44
def read_kb(filename):
    """
    Creates a knowledge base from a file in DIMACS format.
    :param filename: File's relative address.
    :return: Sentence instance.
    """
    if not os.path.isfile(filename):
        print('ERROR: iofiles read_kb -> ' + filename + ' not found')
        exit()

    f = open(filename, 'r')

    # consume comments in the preamble. comment lines start with a 'c'
    for line in f:
        if line[0] != 'c':
            break

    # the next line in the file contains info on the number of clauses and
    # variables. the line begins with a 'p' and the format (cnf).
    # example line: 'p cnf 5 3' --> 5 variables and 3 clauses
    (nbvar, nclauses)= [int(i) for i in line.split()[2:]]

    new_kb = Sentence(nbvar)

    # each of the next lines in the file represents a clause. each line ends
    # with a '0'. example line: ' 1 -5 4 0'
    # save the clauses into an object
    for line in f:
        while line[0] == ' ':
            line = line[1:len(line)]

        if line[0] == '%':
            break

        aux_list = list()
        for variable in line.split()[:-1]:   # discard the ending '0'
            variable = int(variable)
            aux_list.append(variable)

        new_kb.add_clause(tuple(aux_list))

    f.close()
    return new_kb
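A small sketch of exercising read_kb with a throwaway DIMACS file, following the format described in the docstring above (the file contents and path are made up; Sentence comes from the example's project):

import os
import tempfile

dimacs = (
    "c tiny example\n"
    "p cnf 3 2\n"
    "1 -3 0\n"
    "2 3 -1 0\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".cnf", delete=False) as tmp:
    tmp.write(dimacs)
    path = tmp.name

kb = read_kb(path)  # a Sentence over 3 variables holding clauses (1, -3) and (2, 3, -1)
os.remove(path)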
Example #45
def process_text(full_text, api_key):
    """
    Translates and weighs each sentence and returns a giant JSON
    of the Sentence data.

    :param full_text: text to process
    :param api_key: API key passed to the translation service
    :return: JSON of text with metadata
    """
    sentences = get_sentences(full_text)
    trans_sentences = translate_ru2en(sentences, api_key)
    sentence_obj_list = []
    # translate the text fully, then get the sentences in English
    for i in xrange(len(sentences)):
        ru_text = sentences[i]
        en_text = trans_sentences[i]
        weight = get_sentence_weight(ru_text)
        new_sentence = Sentence(ru_text, en_text, weight)
        sentence_obj_list.append(new_sentence.get_json())
    return json.dumps(sentence_obj_list)
Example #46
def serialize_sentences(input_file):
    f_in = open(input_file, "rb")
    sentences = []

    context = etree.iterparse(f_in, tag="S")
    for event, sentence_elem in context:
        sent_obj = Sentence()
        for word_elem in sentence_elem.iter("W"):
            idx = word_elem.attrib["ID"]
            lemma = word_elem.attrib["LEMMA"]
            sent_obj.add_word(lemma, int(idx))

            dom = word_elem.attrib["DOM"]
            if dom != "_root":
                link_type = word_elem.attrib["LINK"]
                sent_obj.add_link(int(idx), int(dom), link_type)
        sentences.append(sent_obj)

    return sentences
Example #47
    def _obj_and_predicate_dict_by_wo_from_sentences(self):
        """
        From self.sentences, find "〜〜を〜〜" patterns and return them as "AをB".
        In the final computation, part-of is judged by the relative order of appearance on the same page.
        """
        results = []
        order = 0
        self._set_subtypes()
        for i, sentence in enumerate(self.sentences):
            if type(sentence) == str:
                sentence = Sentence(text=sentence, query=self.query)

            if not sentence.set_noun_verb_if_good_task():
                continue

            object_term = ObjectTerm(sentence.noun)
            if object_term == 'ましょ':
                pdb.set_trace()

            if object_term.core_noun in constants.STOPWORDS_OF_WEBPAGE_NOUN:
                continue

            distance_between_subtypes = {}
            for subtype in self.subtypes:
                distance = self._distance_between_subtype(i, subtype=subtype)
                distance_between_subtypes[subtype] = distance

            task = Task(object_term=object_term.name,
                        cmp=sentence.cmp,
                        predicate_term=sentence.verb,
                        distance_between_subtypes=distance_between_subtypes,
                        query=self.query,
                        order=order,
                        url=self.url,
                        is_shopping=False,
                        is_official=False,
                        rank=self.rank,
                        sentence=sentence.body)
            results.append(task)
            print('%s_%s_%sというタスクをセットしました' % (sentence.noun, sentence.cmp, sentence.verb))
            order += 1  # order of appearance
        return results
Example #48
    def parse(self):
        this_file = os.path.dirname(__file__)
        current_sentence = []
        for line in open(os.path.join(this_file, self.data_path)):
            split_line = line.split()
            if not split_line:
                self.sentences += Sentence.process_sentence_data(current_sentence)
                current_sentence = []
            else:
                (word, pos_tag, source, tag) = split_line
                tag = tag if tag == 'O' else tag.split('-')[1]
                current_sentence.append((word, pos_tag, tag))
Example #49
    def setUp(self):
        self.sentence_1 = Sentence('夏野菜へのシフトをスタートさせましょう', '')
        self.sentence_2 = Sentence('クワなどは体格や体力に応じたものを、吟味して選ぶようにしましょう', '')
        self.sentence_3 = Sentence('トイレ掃除方法を、解説していきましょう', '')
        self.sentence_4 = Sentence('地面を掘り上げていきましょう', '')
        self.sentence_5 = Sentence('右に移動してください', '')
        self.sentence_6 = Sentence('じっくり本を読む', '')
        self.sentence_7 = Sentence('水まわりは汚れやすいですから、定期的に掃除してガンコな汚れがつかないように気をつけておきましょう', '')
Example #50
def convert_pattern_format(text):
    """
    Text is parsed through pattern's parsing function into a standardized format.
    """
    parsed_text = []
    # parse text via Pattern's parser
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Pattern's tags for each word in the sentence are stored in a new Word-object
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # each word is appended to a Sentence-object
            s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
Example #51
def preproc_save_sentences(filename,raw_sent_stream,extra_stopwords = None):
    stop_words = set(stopwords.words("english"))
    if extra_stopwords is not None:
        stop_words |= set(extra_stopwords)

    with open(filename,"wt") as outf:
        outf.write("[")

        for index,raw_sent in enumerate( raw_sent_stream):
            prev_terminator = '\n' if index ==0 else ',\n'
            sentence = Sentence.from_raw(raw_sent,stop_words)
            if len(sentence.words)>0:
                outf.write(prev_terminator + sentence.dump_json())
                print "{}-th sentence processed and saved".format(index+1)

        outf.write("\n]")
Example #52
    def bot_init(self):
        """
        Initialize and configure your bot!

        Use this function to set options and initialize your own custom bot
        state (if any).
        """

        self.generator = Bot()
        self.sentencer = Sentence()

        ############################
        # REQUIRED: LOGIN DETAILS! #
        ############################
        self.config['api_key'] = CONSUMER_KEY
        self.config['api_secret'] = CONSUMER_SECRET
        self.config['access_key'] = TOKEN
        self.config['access_secret'] = SECRET


        ######################################
        # SEMI-OPTIONAL: OTHER CONFIG STUFF! #
        ######################################
        
        MINS, HOURS = 60, 60 * 60

        # how often to tweet, in seconds
        #self.config['tweet_interval'] = 60 * 60     # default: 30 minutes

        # use this to define a (min, max) random range of how often to tweet
        # e.g., self.config['tweet_interval_range'] = (5*60, 10*60) # tweets every 5-10 minutes
        self.config['tweet_interval_range'] = (30 * MINS, 3 * HOURS)

        # only reply to tweets that specifically mention the bot
        self.config['reply_direct_mention_only'] = False

        # only include bot followers (and original tweeter) in @-replies
        self.config['reply_followers_only'] = True

        # fav any tweets that mention this bot?
        self.config['autofav_mentions'] = False

        # fav any tweets containing these keywords?
        self.config['autofav_keywords'] = []

        # follow back all followers?
        self.config['autofollow'] = False
Example #53
def load_sentences():
    dbname = "tripadvisor_train"
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db["sentiment_sentences"]

    # cursor = sentisent_collection.find({'sentiment':{'$lte':2}}).skip(99).limit(120)
    cursor = sentisent_collection.aggregate([ {'$match':{'sentiment':3}},
                                              { '$sample': { 'size': 120 } } ])
    for index,sentd in enumerate(cursor):
        sent = Sentence.from_dict(sentd)
        print "\n\n[{}] Aspect: {}, Sentiment: {}".format(index+1,sent.aspect,sent.sentiment)
        print sent.raw
        print "--------------"
        print sent.words

    client.close()
Example #54
    def analiza(self, sent):
        sentencia = sent
        # if self.printing: print(sentencia)
        # sent = Sentencia(sentencia)
        # self.sentencias.append(Sentencia(sentencia))
        # self.sentencias[self.n] = Sentencia
        # self.sentencias[self.n].pals = {}
        l = self.tk.tokenize(sentencia);
        ls = self.sp.split(l,0);

        ls = self.mf.analyze(ls);
        ls = self.tg.analyze(ls);
        ls = self.nec.analyze(ls)

        ls = self.sen.analyze(ls);
        ls = self.parser.analyze(ls);
        ls = self.dep.analyze(ls);

        self.sent = Sentence()
        self.sent.set_sentence(sent)
        self.sent.set_con(self.con)

        ## output results
        for s in ls :
            # sent.pals = []
            self.sent.sentence = s
            ws = s.get_words();
            num = 0
            for w in ws :
                self.sent.add_pal(w, num)
                # self.sentencias[len(self.sentencias)-1].pals.append(Pal(w, self.con))
                num += 1
            '''
                print(w.get_form()+" "+w.get_lemma()+" "+w.get_tag()+" "+w.get_senses_string());
            print ("");
            '''
            # self.tr = s.get_parse_tree();
            # self.printTree(self.tr.begin(), 0);
            # self.computeTree(self.tr.begin(), 0);

            self.dp = s.get_dep_tree();
            self.computeDepTree(self.dp.begin(), 0)
Example #55
def import_dataset(filename):
    #....Read input file
    input_file = open(filename, 'r')
    all_lines = input_file.readlines()
    input_file.close()

    #...now, for each line...
    dataset = []
    for line_idx, line in enumerate(all_lines):
        #...preprocess, extract everything....
        try:
            #...construct the sentence with its attributes from raw text...
            new_sentence = Sentence.create_from_raw_text(line)
            dataset.append(new_sentence)
        except Exception as e:
            print("Error found while processing <" + filename + ">, line: " + str(line_idx + 1))
            print(line)
            print(e)

    return dataset
Example #56
def test_sentence():
    stopwords = text_utility.make_stop_words()

    texts = [   "can't is a contraction",
                "she isn't my wife any more",
                "I am not in USA right now",
                "I'm a Chinese",
                "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103",
                "I should've done that thing I didn't do",
                "I don't love her any more",
                "I want to divorce without hesitation",
                "bye, Pullman, bye, USA"]

    for index,text in enumerate(texts):
        sent = Sentence.from_raw(text,stopwords,True)
        print "\n******************** {}".format(index+1)

        print sent.raw
        print "===>"
        print sent.words
Example #57
def print_topics(txt):
    sentence = Sentence.from_raw(txt,stop_words)
    print "\n{}\n".format(sentence.raw)

    coded_words = wordcoder.code(sentence.words)
    bow = dictionary.doc2bow(coded_words)

    topic_distribution = lda_model[bow]
    topic_distribution.sort(key=lambda t: t[1], reverse=True)

    tags = None
    for index, (topic_id, topic_percentage) in enumerate(topic_distribution):
        mt = MixTopic(topic_mapping[topic_id])
        mt.weight(topic_percentage)

        if tags is None:
            tags = mt
        else:
            tags.add(mt)

    tags.normalize()
    print tags
Example #58
def update_add_neg_suffix(dbname,query_condition):
    stop_words = common.make_stop_words()
    client = MongoClient()
    review_collection = client[dbname]['reviews']

    cursor = review_collection.find(query_condition,{"sentences.raw":1,"sentences.words":1})
    for rindex,rd in enumerate(cursor):
        review = Review.from_dict(rd)

        update_content = {}
        for sindex,sent in enumerate(review.sentences):
            new_sent = Sentence.from_raw(sent.raw,stop_words)
            if set(new_sent.words) != set(sent.words):
                update_content["sentences.{}.words".format(sindex)] = new_sent.words

        if len(update_content)>0:
            result = review_collection.update_one({"_id":review.id},{"$set":update_content})
            if result.modified_count != 1:
                raise Exception("failed to update review<{}>".format(review.id))

        print "{}-th review updated {} sentences".format(rindex+1,len(update_content))

    client.close()
Example #59
def sample_split(dbname,num_train,num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences

    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()

    all_samples = []
    cursor = sentisent_collection.aggregate([ { '$sample': { 'size': num_train  + num_test } } ])
    for index,d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append( (sent.words,sent.sentiment) )

        aspect_dist[sent.aspect] +=1
        sentiment_dist[int(sent.sentiment)] +=1
    client.close()

    ################## show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k,aspect_dist.freq(k))

    for k in sentiment_dist:
        print '[{}]: {}'.format(k,sentiment_dist.freq(k))

    ################## shuffle
    random.shuffle(all_samples)

    ################## split
    def __dump(filename,data):
        with open(filename,"wb") as outf:
            cPickle.dump(data,outf)

    __dump("sentidata_train_raw.pkl",all_samples[:num_train])
    __dump("sentidata_test_raw.pkl",all_samples[num_train:])