Example #1
def index(client, freq_file, lang):
    tweets = client['twitter_'+lang]['tweets']
    # freq_dict = load(freq_file)
    freq_dict = defaultdict(int)

    i = 0
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    # for the second db (tr)
    tweets = client['new_'+lang]['tweets']
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1


    save(freq_file, freq_dict)
Example #2
def tokenize(text,
             segment=True,
             norm=True,
             unique=False,
             min_len=2,
             max_sent=0):
    '''
    Tokenize text using SegTok segmenter and tokenizer.
    '''
    sentences = split_multi(text) if segment else [text]

    tokens = []

    for i, s in enumerate(sentences):
        if max_sent and i >= max_sent:
            break
        tokens += word_tokenizer(s)

    if unique:
        tokens = list(set(tokens))

    if min_len:
        tokens = [t for t in tokens if len(t) >= min_len]

    if norm:
        tokens = [w for t in tokens for w in normalize(t).split()]

    return tokens
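A minimal usage sketch for the helper above (the input text is hypothetical, and norm=False is used so the call does not depend on the module's own normalize() helper, which is not shown in this excerpt):

# Hypothetical usage of tokenize(); assumes split_multi and word_tokenizer are
# imported at module level from segtok.segmenter and segtok.tokenizer.
sample = "SegTok splits sentences. It also tokenizes them, e.g. like this."
print(tokenize(sample, norm=False))              # all tokens of length >= 2
print(tokenize(sample, norm=False, max_sent=1))  # tokens from the first sentence only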
Example #3
def tokenize_text(doc_text):
    '''Tokenize the text and preserve offsets'''
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            word = unidecode(word)
            sent_token = Token(word, word_offset, word_offset + len(word),
                               TOKEN_O)
            sent_tokens.append(sent_token)
        if sent_tokens:
            sent_start = sent_tokens[0].start
            sent_end = sent_tokens[-1].end
        # Update sentence offsets
        for token in sent_tokens:
            token.sent_start = sent_start
            token.sent_end = sent_end
        doc_tokens.append(sent_tokens)
    return doc_tokens
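The same offset-preserving idea can be sketched with segtok alone; the snippet below is a hypothetical standalone variant and does not use the repo's Token, SPLIT_REGEX, or TOKEN_O definitions, which are not shown here:

# Hypothetical standalone sketch of offset-preserving tokenization with segtok;
# each token is located back in the original text with str.index().
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

def tokens_with_offsets(doc_text):
    offset = 0
    for sent in split_multi(doc_text):
        for word in word_tokenizer(sent):
            start = doc_text.index(word, offset)
            offset = start + len(word)
            yield word, start, offset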
Example #4
 def __build_graph__(self):
     stopwords = get_stopwords(self.lan)
     stem = get_stem(self.lan).stem
     self.G = nx.Graph()
     sentences_str = [[
         w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0
     ] for s in list(split_multi(self.text)) if len(s.strip()) > 0]
     for sentence in sentences_str:
         buffer = []
         for word in sentence:
             if len([
                     c for c in word if c in EXCLUDE
             ]) == len(word) or word.lower() in stopwords or word.replace(
                     '.', '').replace(',', '').replace('-', '').isnumeric():
                 continue
             else:
                 #stemmed_word = lemma(word).lower()
                 stemmed_word = stem(word)
                 if stemmed_word not in self.G:
                     self.G.add_node(stemmed_word, TF=0)
                 self.G.node[stemmed_word]['TF'] += 1
                 for (idx_cooccur,
                      word_cooccur) in enumerate(buffer[-self.w:]):
                     self.__add_cooccur__(word_cooccur, stemmed_word,
                                          idx_cooccur + 1)
                 buffer.append(stemmed_word)
     self.__build_linegraph__()
Example #5
def single_extraction(sentences):
    with open('config/logging_config.yaml') as f:
        logging.config.dictConfig(yaml.load(f))
    logger = logging.getLogger('single_relation_extraction')

    parser_server = 'http://127.0.0.1:8084'

    for sent in split_multi(sentences):
        sent = sent.strip()
        if sent:
            logger.debug(u'SENTENCE: {}'.format(sent))
            try:
                extractor = RelationExtractor(sent, parser_server, logger, entity_linking_flag=False)
            except:
                logger.error(u'Failed to parse the sentence', exc_info=True)
            else:
                extractor.extract_spo()
                for relation in extractor.relations:
                    logger.debug(u'SUBJECT HEAD: {}'.format(relation.subject.head))
                    logger.debug(u'SUBJECT NN HEAD: {}'.format(relation.subject.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'SUBJECT EL: {}'.format(relation.subject_el))
                    logger.debug(u'OBJECT HEAD: {}'.format(relation.object.head))
                    logger.debug(u'OBJECT NN HEAD: {}'.format(relation.object.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'OBJECT EL: {}'.format(relation.object_el))
                    logger.debug(u'RELATION LEMMA: {}'.format(relation.lemma))
                    logger.debug(u'RELATION CANONICAL: {}'.format(relation.canonical_form))
Example #6
    def add_document(self, text):
        text = self.pre_filter(text)
        sentences_str = [[
            w for w in split_contractions(web_tokenizer(s))
            if not (w.startswith("'") and len(w) > 1) and len(w) > 0
        ] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences += len(sentences_str)
        self.number_of_documents += 1
        pos_text = 0
        document_candidates = {}
        term_in_doc = {}
        sentences_obj = []
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([
                        c for c in word if c in self.exclude
                ]) == len(word):  # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append(block_of_word_obj)
                        cand = ComposedWord(block_of_word_obj)
                        cand = self.add_or_update_composed_word(cand)
                        if cand.unique_kw not in document_candidates:
                            document_candidates[cand.unique_kw] = cand
                        block_of_word_obj = []
                else:
                    tag = self.get_tag(word, pos_sent)
                    term_obj = self.get_term(word)
                    term_in_doc[term_obj.unique_term] = term_obj
                    term_obj.add_occurrence(tag, sentence_id, pos_sent,
                                            pos_text, self.number_of_documents)
                    pos_text += 1
                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(
                            range(
                                max(0,
                                    len(block_of_word_obj) - self.windowsSize),
                                len(block_of_word_obj)))
                        for w in word_windows:
                            if block_of_word_obj[w][
                                    0] not in self.tagsToDiscard:
                                self.add_cooccurrence(block_of_word_obj[w][2],
                                                      term_obj)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append((tag, word, term_obj))
            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append(block_of_word_obj)
            if len(sentence_obj_aux) > 0:
                sentences_obj.append(sentence_obj_aux)
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
        self.number_of_words += pos_text
        return document_candidates, term_in_doc
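The sliding co-occurrence window used above (pairing each term with the last windowsSize entries of block_of_word_obj) can be isolated into a small standalone sketch; the helper below is hypothetical and not part of the original class:

# Hypothetical standalone sketch of the sliding co-occurrence window:
# each term is paired with at most window_size preceding terms.
def cooccurrence_pairs(terms, window_size=2):
    pairs = []
    buffer = []
    for term in terms:
        for previous in buffer[-window_size:]:
            pairs.append((previous, term))
        buffer.append(term)
    return pairs

# cooccurrence_pairs(["a", "b", "c"], window_size=1) -> [("a", "b"), ("b", "c")]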
Example #7
    def parse(self):
        if self.error:
            return
        for count, sentence in enumerate(split_multi(self.result["text"])):
            self.result["sentences"][count] = {"string": sentence,
                                               "pos": [],
                                               "sentiment": [],
                                               "stanford": [],
                                               "count": count}

        if self.use_threads:
            self._threaded_parser()
        else:
            self._parser()

        if self.use_stats:
            self.stats_all()

        self.result["ners"] = []
        tmp_ners = []

        #TODO: integrate the results from pp
        for item in [self.result.get('sentences').get(s).get('stanford') for s in self.result.get('sentences')]:
            if item and item.get('ners'):
                for ner in item.get('ners'):
                    if ner.get('tag') in ['person', 'per']:
                        self.result["ners"].append(ner.get('string'))
Example #8
def extract_from_txt(input_dir):
    if not input_dir.endswith('/'):
        input_dir += '/'

    file_dir = input_dir + 'processed/'
    output_dir = input_dir + 'verbs/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir))

    extractor = VerbExtractor()
    for root, _, files in os.walk(file_dir):
        for fn in files:
            if fn.endswith('.txt'):
                filename = os.path.join(root, fn)
                logging.info('Reading {}'.format(filename))
                f_in = codecs.open(filename, encoding='utf-8')
                text = f_in.read()
                results = {}
                for sent in split_multi(text):
                    verbs = extractor.extract(sent.lower())
                    if verbs:
                        for verb, category in verbs:
                            results.setdefault(category, []).append(verb)
                f_in.close()
                output_file = filename.replace('processed', 'verbs').replace('.txt', '_verbs.txt')
                logging.info('Writing to {}'.format(output_file))
                f_out = codecs.open(output_file, mode='w', encoding='utf-8')
                for category in extractor.bloom_verb_categories:
                    counter = Counter(results[category])
                    output_str = [u'{}({})'.format(item[0], str(item[1])) for item in counter.items()]
                    f_out.write(u'{}: {}\n'.format(category, ', '.join(output_str)))
                f_out.close()
Example #9
def sentence_segmentation(text):
	'''Takes a text (string) and returns a list of sentences.
	
	Every item of the list should be exactly one whole sentence (although in practice there are errors).
	
	By a 'sentence' we mean a sequence of words carrying one complete thought. E.g. the text on this line
	before "E.g." was one sentence. A compound sentence is also 'one sentence', and so is a title
	(beware, titles are usually not ended by a full stop) or a menu item.
	
	This function uses the segmentation from the `segtok` library, but improves it with one additional
	rule: it also tries to split sentences that are not separated by a dot and a space, whenever the dot
	separates two existing English words present in the dictionary.
	'''
	sentences = list(segmenter.split_multi(text))
	out_sentences = []
	for s in sentences:
		sen_words = s.split()
		current_sentence = []
		for w in sen_words:
			if not "http" in w and reg.match(w):
				a = re.split(r"(\)?[.!?:])\(?",w)
				if all(x.lower() in words or (x.lower().endswith("s") and x[:-1] in words) for x in a[:2:2]):
					current_sentence.append(a[0]+a[1])
					out_sentences.append(" ".join(current_sentence))
					current_sentence = ["".join(a[2:])]
				else:
					current_sentence.append(w)
			else:
				current_sentence.append(w)
		out_sentences.append(" ".join(current_sentence))
	return out_sentences
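A hypothetical call illustrating the extra rule described in the docstring; it assumes the module-level globals used above (the English word set `words`, the compiled regex `reg`, and `segmenter` from segtok) are available, since they are not shown in this excerpt:

# Hypothetical usage of sentence_segmentation(); relies on module-level globals
# (words, reg, segmenter, re) that are not shown in this excerpt.
text = "The file was saved.Nobody noticed the missing space after the dot."
for sentence in sentence_segmentation(text):
    print(sentence)
# The additional rule should split "saved.Nobody", because "saved" is an
# ordinary dictionary word even though no space follows the full stop.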
Example #10
    def split(self, text: str) -> List[Sentence]:
        plain_sentences: List[str] = list(split_multi(text))

        try:
            sentence_offset: Optional[int] = text.index(plain_sentences[0])
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
                f"from the text's starting position") from error

        sentences: List[Sentence] = []
        for sentence, next_sentence in stagger(plain_sentences,
                                               offsets=(0, 1),
                                               longest=True):

            sentences.append(
                Sentence(text=sentence,
                         use_tokenizer=self._tokenizer,
                         start_position=sentence_offset))

            offset: int = sentence_offset + len(sentence)
            try:
                sentence_offset = text.index(
                    next_sentence,
                    offset) if next_sentence is not None else None
            except ValueError as error:
                raise AssertionError(
                    f"Can't find the sentence offset for sentence {repr(sentence)} "
                    f"starting from position {repr(offset)}") from error

        return sentences
Example #11
 def tokenize(self, text):
     """
     Tokenize the text and filter out @usernames (and punctuation, smileys, ...), leaving only words.
     """
     words = [] # list of words
     # text = text.decode('utf-8')
     text = filter_pattern.sub(' ', text)
     for sent in split_multi(text):
         for token in word_tokenizer(sent):
             words.append(token.encode('utf-8', 'ignore'))
     return words
Example #12
def sentence_prepare(text, vectorizer, sent_len, doc_len, unique=False):

    #from nltk.tokenize import sent_tokenize
    from segtok.segmenter import split_multi

    vocab = vectorizer.vocabulary_
    tokenizer = vectorizer.build_tokenizer()

    #text = [sent_tokenize(doc) for doc in text  ]
    text = [list(split_multi(doc)) for doc in text]

    seq = []
    sent_l = []
    doc_l = []
    for doc in text:
        doc_tok = []
        for sent in doc:
            sent_toks = [vocab[y] for y in tokenizer(sent) if y in vocab]
            doc_tok.append(sent_toks)
            sent_l.append(len(sent_toks))

        seq.append(doc_tok)
        doc_l.append(len(doc_tok))

    sent_l = np.array(sent_l)
    doc_l = np.array(doc_l)

    print("Average Sent Length: ", sent_l.mean())
    print("90% Length: ", np.percentile(sent_l, 90))

    print("Average Doc Length: ", doc_l.mean())
    print("90% Length: ", np.percentile(doc_l, 90))

    #sent_len = np.percentile(sent_l, 90)
    #doc_len = np.percentile(doc_l, 90)

    padded_docs = torch.zeros(len(seq), doc_len, sent_len)

    for i, _doc in enumerate(seq):

        if len(_doc) > doc_len:
            _doc = _doc[:doc_len]
            padded_seq = pad_doc(_doc, sent_len, len(_doc))
        else:
            if len(_doc) == 0:
                continue

            padded_seq = pad_doc(_doc, sent_len, doc_len)

        padded_docs[i] = padded_seq

    return padded_docs
Example #13
def tokenize_old(output_file, db = 'crawler'):
    client = MongoClient()
    texts = client[db]['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find():
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()
Example #14
 def preprocessing(self):
     # split the text into sentences
     sentences = split_multi(self.text)
     # get the chunks of text
     self.Chuncks = self.chunks(sentences)
     self.chunckDict = []
     wordDict = dict()
     for chunck in self.Chuncks:
         wordDict = dict()
         for word in range(len(chunck)):
             # get the set of words
             if chunck[word].lower() not in self.tokens:
                 self.tokens[chunck[word].lower()]
             wordDict[chunck[word]] = self.getNameTag(chunck[word], word)
             self.chunckDict.append(wordDict)
Example #15
def read(origin_file, freq_file, lang):
    freq_dict = defaultdict(int)
    i = 0
    for line in open(origin_file):
        i += 1
        if i % 100000 == 0:
            print i
        items = line.strip().split(',', 3)
        if len(items) == 4 and items[0] == lang:
            # text = items[3].lower().decode('utf-8')
            text = items[3].decode('utf-8')
            text = re.sub(filter_pattern, '', text)
            for sent in split_multi(text):
                for word in word_tokenizer(sent):
                    freq_dict[word] += 1
    save(freq_file, freq_dict)
Example #16
def read_sentences_from_file(path_to_file, one_sentence_per_line=True):
    lines = []
    with io.open(path_to_file, mode="r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line != "":
                lines.append(line.strip())

    if one_sentence_per_line:
        sentences = lines
    else:
        text = " ".join(lines)
        sentences = list(split_multi(text))
        sentences = [sentence for sentence in sentences if sentence != ""]

    return sentences
Example #17
def tokenize_on_date(output_file, date = '2015-07-05'):
    client = MongoClient()
    texts = client['crawler']['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find({'date': date}):
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                if re.search("'s$", token):
                    f.write('%s\t%s\n' % (token[:-2].encode('utf-8', 'ignore'), 'X'))
                    f.write('%s\t%s\n' % (token[-2:].encode('utf-8', 'ignore'), 'X'))
                else:
                    f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
                    
            f.write('\n')
    f.close()
Example #18
    def tokenize(self, tweets):
        """
        Tokenize the text and filter out @usernames (and punctuation, smileys, ...), leaving only words.
        """

        counts = [] # [5, 12, 0, 3, ...] the counts of valid words for each tweet
        words = [] # list of words
        # out = '' # one-word-per-line string of the tokenized words for morph analysis
        
        for (text, tid, uid) in tweets:
            i = 0
            text = filter_pattern.sub(' ', text)
            for sent in split_multi(text):
                for token in word_tokenizer(sent):
                    # words.append(token.lower().encode('utf-8', 'ignore'))
                    words.append(token.encode('utf-8', 'ignore'))
                    i += 1
            counts.append(i)
        return words, counts
Example #19
def tokenize_document(doc_path):
    '''Tokenize the text and preserve offsets'''
    with codecs.open(doc_path, 'r', "utf-8") as myfile:
        doc_text = myfile.read()
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            sent_token = Token(word, word_offset, word_offset + len(word),
                               TOKEN_O)
            sent_tokens.append(sent_token)
        doc_tokens.append(sent_tokens)
    return doc_tokens, doc_text
Example #21
 def __iter__(self):
     for root, _, files in os.walk(self._raw_text_dir):
         for fn in files:
             if fn.endswith('.txt'):
                 filename = os.path.join(root, fn)
                 f = codecs.open(filename, encoding='utf-8')
                 text = f.read()
                 if text:
                     text = self.process_text(text)
                     for sent in split_multi(text):
                         # Discard very long and very short sentences
                         if sent and len(sent) < 1000 and len(sent.split()) > 2:
                             sent = sent.strip()
                             yield filename, sent
                 f.close()
                 # Move the preprocessed files to a temp directory, so we know which files are done.
                 done_filename = filename.replace('/raw/', '/raw_preprocessed/')
                 if not os.path.exists(os.path.dirname(done_filename)):
                     os.makedirs(os.path.dirname(done_filename))
                 os.rename(filename, done_filename)
Example #22
    def Features_computation(self):

        for word in self.tokens:
            Tfa = self.get_word_tags(word, "a")
            TfU = self.get_word_tags(word, "U")
            # compute TCase
            self.termsCalcule[word].TCase = max(
                Tfa, TfU) / math.log(1 + math.log(self.terms[word].TF))
            #TPos
            self.termsCalcule[word].TPos = math.log(
                3 + median(self.terms[word].offsets_sentences))
            #TFNorm
            validTFs = [
                self.terms[term].TF for term in self.tokens
                if not self.stopWord
            ]
            avgTF = mean(validTFs)
            stdTF = stdev(validTFs)
            self.termsCalcule[word].TFNorm = self.terms[word].TF / (avgTF +
                                                                    stdTF)
            #len( split_multi(self.text))
            self.termsCalcule[
                word].TSent = self.terms[word].offsets_sentences / len(
                    split_multi(self.text))

            #TRel
            maxTF = max([self.terms[term].TF for term in self.tokens])
            try:
                DL = self.calcule_DL(self.cooccur,
                                     self.cooccur(word))[1] / self.calcule_DL(
                                         self.cooccur, self.cooccur(word))[0]
            except:
                DL = 0
            try:
                DR = self.calcule_DR(self.cooccur,
                                     self.cooccur(word))[1] / self.calcule_DR(
                                         self.cooccur, self.cooccur(word))[0]
            except:
                DR = 0
            self.termsCalcule[word].TRel = 1 + (DL + DR) * (
                self.terms[word].TF / maxTF)
Example #23
def clean_int(sents):
    introduction = []
    paragraph = []
    for sent in sents:
        sent = sent.lower()

        sent = re.sub('[~|\'|\"|``|\t|\n]', ' ', sent)
        sent = re.sub('{.*}', ' @cite', sent)
        sent = re.sub('\(.*\)', ' @remark', sent)
        sent = re.sub('\[.*\]', ' @cite', sent)

        sent = re.sub('e\s*\.g\s*\.\s*,', ' e.g., ', sent)
        sent = re.sub('e\s*\.g\s*\.\s*', ' e.g., ', sent)
        sent = re.sub('etc\s*\.', ' etc. ', sent)
        sent = re.sub('et\s*al\s*\.\s*,', ' et al., ', sent)
        sent = re.sub('i\s*\.e\s*\.\s*,', ' i.e., ', sent)
        sent = re.sub('[;|:]', '. ', sent)
        sent = re.sub(',[\s|,]*,', ', ', sent)
        sent = re.sub('\s*\.\s*', '. ', sent)

        tmp = split_multi(sent)

        res = []

        for each in tmp:
            if 'i.e' not in each and 'e.g' not in each:
                x = sent_tokenize(each)
                for y in x:
                    y = y.split()
                    if len(y) < 5: continue
                    y = ' '.join(y)
                    res.append(y)
                continue
            each = each.split()
            if len(each) < 5: continue
            each = ' '.join(each)

            res.append(each)
        paragraph.append(len(res))
        introduction.extend(res)
    return introduction, paragraph
Example #24
 def compute_term_statistics(self):
     sentencesL = sent_tokenize(self.text)
     sentencesL = list(map(lambda sente: sente.lower(), sentencesL))
     sentences = split_multi(self.text)
     chuncks = self.chunks(sentences)
     for chunck in chuncks:
         for word in range(len(chunck)):
             if chunck[word] not in self.stopWord and len(chunck[word]) > 3:
                 # compute the TF of the word
                 self.terms[chunck[word]].TF += 1
                 # compute the sum of the positions of the sentences where the word occurs
                 self.terms[chunck[
                     word]].offsets_sentences = self.__getSumIndexSents(
                         sentencesL, chunck[word])
                 for j in range(self.window):
                     try:
                         if (chunck[word],
                                 chunck[j -
                                        word]) not in self.cooccur.keys():
                             self.cooccur[(chunck[word],
                                           chunck[j - word])] = 0
                         elif self.occurance(chunck, chunck[word],
                                             chunck[j - word]):
                             self.cooccur[(chunck[word],
                                           chunck[j - word])] += 1
                     except:
                         pass
                     try:
                         if (chunck[word],
                                 chunck[j +
                                        word]) not in self.cooccur.keys():
                             self.cooccur[(chunck[word],
                                           chunck[j + word])] = 0
                         elif self.occurance(chunck, chunck[word],
                                             chunck[j + word]):
                             self.cooccur[(chunck[word],
                                           chunck[j + word])] += 1
                     except:
                         pass
Example #25
    def split(self, text: str) -> List[Sentence]:
        sentences = []
        offset = 0

        plain_sentences = split_multi(text)
        for sentence in plain_sentences:
            sentence_offset = text.find(sentence, offset)

            if sentence_offset == -1:
                raise AssertionError(
                    f"Can't find offset for sentences {plain_sentences} "
                    f"starting from {offset}")

            sentences += [
                Sentence(text=sentence,
                         use_tokenizer=self._tokenizer,
                         start_position=sentence_offset)
            ]

            offset += len(sentence)

        return sentences
Example #26
def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes text from html element.

    :param cleaned_html_node: html node to extract text sentences
    :param summary_sentences_qty: quantity of sentences of summarized text
    :return: summarized text, two-digit language code
    """
    clean_text = ""

    # assemble the text only from complete sentences that end with proper punctuation.
    for node in cleaned_html_node.iter('p'):
        if node.text is not None:
            for sentence in split_multi(node.text):
                if len(sentence) > 0 and sentence[-1:] in ['.', '!', '?', '…'] and \
                        not sentence.strip(' .!?…').isdigit() and not dialog_re.match(sentence):
                    clean_text = clean_text + ' ' + sentence

    # creating summary, obtaining language code and total sentences quantity
    final_result, lang_code, sent_qty = create_referat(clean_text, '', summary_sentences_qty)

    return final_result, lang_code
Example #27
def textrank(text, hdr):
    # finding out the most possible language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # tokenizing for words
    sentences = [sentence for sentence in split_multi(text)]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))

    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
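A hypothetical way to consume the ranked output above and build a short extractive summary (the variable names are illustrative, and the module-level helpers lang_identifier, similarity, and LANG_CODES are assumed to exist as in the excerpt):

# Hypothetical consumption of textrank(): keep the top-ranked sentences and
# restore document order to form a short extractive summary.
ranked, lang_code = textrank(article_text, article_title)
top = sorted(ranked[:3], key=lambda item: item[0])  # back to document order
summary = ' '.join(sentence for _, _, sentence in top)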
Example #28
    def split(self, text: str) -> List[Sentence]:
        plain_sentences: List[str] = split_multi(text)
        sentence_offset = 0

        sentences: List[Sentence] = []
        for sentence in plain_sentences:
            try:
                sentence_offset = text.index(sentence, sentence_offset)
            except ValueError as error:
                raise AssertionError(
                    f"Can't find the sentence offset for sentence {repr(sentence)} "
                    f"starting from position {repr(sentence_offset)}"
                ) from error
            sentences.append(
                Sentence(
                    text=sentence,
                    use_tokenizer=self._tokenizer,
                    start_position=sentence_offset,
                ))

            sentence_offset += len(sentence)

        return sentences
Example #29
def segment(text):
    '''
    Split text into sentences using SegTok segmenter.
    '''
    return split_multi(text)
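Note that split_multi yields sentences lazily, which is why most callers above wrap it in list() when they need indexing, reuse, or len(); a minimal sketch using the wrapper above:

# split_multi yields sentences lazily, so wrap the result in list() when it
# needs to be indexed, reused, or measured with len().
sentences = list(segment("First sentence. Second one? Yes."))
print(len(sentences))  # expected: 3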
Example #30
 def test_multi_spans(self):
     self.assertSequenceEqual(SPAN_TEST_ANSWER,
                              list(split_multi(SPAN_TEST_TEXT)))
Example #31
 def test_multiline(self):
     text = "This is a\nmultiline sentence. And this is Mr.\nAbbrevation."
     ml_sentences = ["This is a\nmultiline sentence.", "And this is Mr.\nAbbrevation."]
     self.assertSequenceEqual(ml_sentences, list(split_multi(text)))
Example #32
 def sentence_tokenizer(self, string):
     tokenized_sentences = list(split_multi(string))
     return tokenized_sentences
Example #33
from segtok.segmenter import split_multi
text = open("abbrev._text","r").read()
sentences = split_multi(text)

import results
results.print_results(sentences)

#for i in sentences:
#	print((i,))


Example #34
    def _build(self, text, windowsSize, n):
        text = self.pre_filter(text)
        self.sentences_str = [
            [
                w
                for w in split_contractions(web_tokenizer(s))
                if not (w.startswith("'") and len(w) > 1) and len(w) > 0
            ]
            for s in list(split_multi(text))
            if len(s.strip()) > 0
        ]
        self.number_of_sentences = len(self.sentences_str)
        pos_text = 0
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(
                    word
                ):  # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append(block_of_word_obj)
                        block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    # Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(
                            range(
                                max(0, len(block_of_word_obj) - windowsSize),
                                len(block_of_word_obj),
                            )
                        )
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard:
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    # Generate candidate keyphrase list
                    candidate = [(tag, word, term_obj)]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(
                        range(
                            max(0, len(block_of_word_obj) - (n - 1)),
                            len(block_of_word_obj),
                        )
                    )[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.0
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append((tag, word, term_obj))

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append(block_of_word_obj)

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

        self.number_of_words = pos_text