def index(client, freq_file, lang):
    tweets = client['twitter_' + lang]['tweets']
    # freq_dict = load(freq_file)
    freq_dict = defaultdict(int)
    i = 0
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    # for the second db (tr)
    tweets = client['new_' + lang]['tweets']
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1

    save(freq_file, freq_dict)

def tokenize(text, segment=True, norm=True, unique=False, min_len=2, max_sent=0):
    ''' Tokenize text using SegTok segmenter and tokenizer. '''
    sentences = split_multi(text) if segment else [text]
    tokens = []
    for i, s in enumerate(sentences):
        if max_sent and i >= max_sent:
            break
        tokens += word_tokenizer(s)
    if unique:
        tokens = list(set(tokens))
    if min_len:
        tokens = [t for t in tokens if len(t) >= min_len]
    if norm:
        tokens = [w for t in tokens for w in normalize(t).split()]
    return tokens

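For orientation, the snippets in this listing mostly combine the same two segtok calls: split_multi for sentence segmentation and word_tokenizer for word tokenization. Below is a minimal sketch of that pattern; the sample text is invented, and only the two imports come from segtok itself.

# Minimal sketch of the segtok pattern used throughout these examples.
# Assumes segtok is installed; the sample text is illustrative only.
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

sample_text = "Dr. Smith arrived at the lab. The experiment started immediately."
for sentence in split_multi(sample_text):    # yields sentence strings
    print(word_tokenizer(sentence))          # prints a list of token strings per sentence
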
def tokenize_text(doc_text):
    '''Tokenize the text and preserve offsets'''
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            word = unidecode(word)
            sent_token = Token(word, word_offset, word_offset + len(word), TOKEN_O)
            sent_tokens.append(sent_token)
        if sent_tokens:
            sent_start = sent_tokens[0].start
            sent_end = sent_tokens[-1].end
            # Update sentence offsets
            for token in sent_tokens:
                token.sent_start = sent_start
                token.sent_end = sent_end
            doc_tokens.append(sent_tokens)
    return doc_tokens

def __build_graph__(self):
    stopwords = get_stopwords(self.lan)
    stem = get_stem(self.lan).stem
    self.G = nx.Graph()
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(self.text)) if len(s.strip()) > 0
    ]
    for sentence in sentences_str:
        buffer = []
        for word in sentence:
            if len([c for c in word if c in EXCLUDE]) == len(word) \
                    or word.lower() in stopwords \
                    or word.replace('.', '').replace(',', '').replace('-', '').isnumeric():
                continue
            else:
                # stemmed_word = lemma(word).lower()
                stemmed_word = stem(word)
                if stemmed_word not in self.G:
                    self.G.add_node(stemmed_word, TF=0)
                self.G.node[stemmed_word]['TF'] += 1
                for (idx_cooccur, word_cooccur) in enumerate(buffer[-self.w:]):
                    self.__add_cooccur__(word_cooccur, stemmed_word, idx_cooccur + 1)
                buffer.append(stemmed_word)
    self.__build_linegraph__()

def single_extraction(sentences):
    with open('config/logging_config.yaml') as f:
        logging.config.dictConfig(yaml.load(f))
    logger = logging.getLogger('single_relation_extraction')

    parser_server = 'http://127.0.0.1:8084'

    for sent in split_multi(sentences):
        sent = sent.strip()
        if sent:
            logger.debug(u'SENTENCE: {}'.format(sent))
            try:
                extractor = RelationExtractor(sent, parser_server, logger, entity_linking_flag=False)
            except:
                logger.error(u'Failed to parse the sentence', exc_info=True)
            else:
                extractor.extract_spo()
                for relation in extractor.relations:
                    logger.debug(u'SUBJECT HEAD: {}'.format(relation.subject.head))
                    logger.debug(u'SUBJECT NN HEAD: {}'.format(relation.subject.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'SUBJECT EL: {}'.format(relation.subject_el))
                    logger.debug(u'OBJECT HEAD: {}'.format(relation.object.head))
                    logger.debug(u'OBJECT NN HEAD: {}'.format(relation.object.nn_head))
                    if extractor.entity_linking_flag:
                        logger.debug(u'OBJECT EL: {}'.format(relation.object_el))
                    logger.debug(u'RELATION LEMMA: {}'.format(relation.lemma))
                    logger.debug(u'RELATION CANONICAL: {}'.format(relation.canonical_form))

def add_document(self, text):
    text = self.pre_filter(text)
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences += len(sentences_str)
    self.number_of_documents += 1
    pos_text = 0
    document_candidates = {}
    term_in_doc = {}
    sentences_obj = []
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    cand = ComposedWord(block_of_word_obj)
                    cand = self.add_or_update_composed_word(cand)
                    if cand.unique_kw not in document_candidates:
                        document_candidates[cand.unique_kw] = cand
                    block_of_word_obj = []
            else:
                tag = self.get_tag(word, pos_sent)
                term_obj = self.get_term(word)
                term_in_doc[term_obj.unique_term] = term_obj
                term_obj.add_occurrence(tag, sentence_id, pos_sent, pos_text, self.number_of_documents)
                pos_text += 1
                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - self.windowsSize), len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.add_cooccurrence(block_of_word_obj[w][2], term_obj)
                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        sentences_obj.append(sentence_obj_aux)
    self.number_of_words += pos_text
    return document_candidates, term_in_doc

def parse(self):
    if self.error:
        return
    for count, sentence in enumerate(split_multi(self.result["text"])):
        self.result["sentences"][count] = {"string": sentence,
                                           "pos": [],
                                           "sentiment": [],
                                           "stanford": [],
                                           "count": count}
    if self.use_threads:
        self._threaded_parser()
    else:
        self._parser()
    if self.use_stats:
        self.stats_all()
    self.result["ners"] = []
    tmp_ners = []
    # TODO: integrate the results from pp
    for item in [self.result.get('sentences').get(s).get('stanford') for s in self.result.get('sentences')]:
        if item and item.get('ners'):
            for ner in item.get('ners'):
                if ner.get('tag') in ['person', 'per']:
                    self.result["ners"].append(ner.get('string'))

def extract_from_txt(input_dir):
    if not input_dir.endswith('/'):
        input_dir += '/'
    file_dir = input_dir + 'processed/'
    output_dir = input_dir + 'verbs/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir))
    extractor = VerbExtractor()
    for root, _, files in os.walk(file_dir):
        for fn in files:
            if fn.endswith('.txt'):
                filename = os.path.join(root, fn)
                logging.info('Reading {}'.format(filename))
                f_in = codecs.open(filename, encoding='utf-8')
                text = f_in.read()
                results = {}
                for sent in split_multi(text):
                    verbs = extractor.extract(sent.lower())
                    if verbs:
                        for verb, category in verbs:
                            results.setdefault(category, []).append(verb)
                f_in.close()
                output_file = filename.replace('processed', 'verbs').replace('.txt', '_verbs.txt')
                logging.info('Writing to {}'.format(output_file))
                f_out = codecs.open(output_file, mode='w', encoding='utf-8')
                for category in extractor.bloom_verb_categories:
                    counter = Counter(results[category])
                    output_str = [u'{}({})'.format(item[0], str(item[1])) for item in counter.items()]
                    f_out.write(u'{}: {}\n'.format(category, ', '.join(output_str)))
                f_out.close()

def sentence_segmentation(text):
    '''gets a text (string), returns list of sentences.
    Every item of a list should be one and only one whole sentence.
    (But in fact there are errors.)

    As a 'sentence' we define a sequence of words carrying one whole thought.
    E.g. the text on this line before "E.g." was one sentence.
    Compound sentence is also 'one sentence'. 'One sentence' is also a title
    (beware, they are usually not ended by full stop) or a menu item.

    This function uses segmentation from `segtok` library, but it improves it
    by one additional rule. It tries to split also sentences that are not ended
    by dot and space, if the dot separates two existing English words present
    in dictionary.
    '''
    sentences = list(segmenter.split_multi(text))
    out_sentences = []
    for s in sentences:
        sen_words = s.split()
        current_sentence = []
        for w in sen_words:
            if not "http" in w and reg.match(w):
                a = re.split(r"(\)?[.!?:])\(?", w)
                if all(x.lower() in words or (x.lower().endswith("s") and x[:-1] in words) for x in a[:2:2]):
                    current_sentence.append(a[0] + a[1])
                    out_sentences.append(" ".join(current_sentence))
                    current_sentence = ["".join(a[2:])]
                else:
                    current_sentence.append(w)
            else:
                current_sentence.append(w)
        out_sentences.append(" ".join(current_sentence))
    return out_sentences

def split(self, text: str) -> List[Sentence]:
    plain_sentences: List[str] = list(split_multi(text))
    try:
        sentence_offset: Optional[int] = text.index(plain_sentences[0])
    except ValueError as error:
        raise AssertionError(
            f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
            f"from the text's starting position") from error

    sentences: List[Sentence] = []
    for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):
        sentences.append(
            Sentence(text=sentence,
                     use_tokenizer=self._tokenizer,
                     start_position=sentence_offset))

        offset: int = sentence_offset + len(sentence)
        try:
            sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(sentence)} "
                f"starting from position {repr(offset)}") from error

    return sentences

def tokenize(self, text):
    """
    tokenize the text and filter the @username (and punctuation, smiley ...), leave only words
    """
    words = []  # list of words
    # text = text.decode('utf-8')
    text = filter_pattern.sub(' ', text)
    for sent in split_multi(text):
        for token in word_tokenizer(sent):
            words.append(token.encode('utf-8', 'ignore'))
    return words

def sentence_prepare(text, vectorizer, sent_len, doc_len, unique=False):
    # from nltk.tokenize import sent_tokenize
    from segtok.segmenter import split_multi
    vocab = vectorizer.vocabulary_
    tokenizer = vectorizer.build_tokenizer()
    # text = [sent_tokenize(doc) for doc in text]
    text = [list(split_multi(doc)) for doc in text]
    seq = []
    sent_l = []
    doc_l = []
    for doc in text:
        doc_tok = []
        for sent in doc:
            sent_toks = [vocab[y] for y in tokenizer(sent) if y in vocab]
            doc_tok.append(sent_toks)
            sent_l.append(len(sent_toks))
        seq.append(doc_tok)
        doc_l.append(len(doc_tok))
    sent_l = np.array(sent_l)
    doc_l = np.array(doc_l)
    print("Average Sent Length: ", sent_l.mean())
    print("90% Length: ", np.percentile(sent_l, 90))
    print("Average Doc Length: ", doc_l.mean())
    print("90% Length: ", np.percentile(doc_l, 90))
    # sent_len = np.percentile(sent_l, 90)
    # doc_len = np.percentile(doc_l, 90)
    padded_docs = torch.zeros(len(seq), doc_len, sent_len)
    for i, _doc in enumerate(seq):
        if len(_doc) > doc_len:
            _doc = _doc[:doc_len]
            padded_seq = pad_doc(_doc, sent_len, len(_doc))
        else:
            if len(_doc) == 0:
                continue
            padded_seq = pad_doc(_doc, sent_len, doc_len)
        padded_docs[i] = padded_seq
    return padded_docs

def tokenize_old(output_file, db='crawler'):
    client = MongoClient()
    texts = client[db]['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find():
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()

def preprocessing(self):
    # split the text into sentences
    sentences = split_multi(self.text)
    # get the chunks of text
    self.Chuncks = self.chunks(sentences)
    self.chunckDict = []
    wordDict = dict()
    for chunck in self.Chuncks:
        wordDict = dict()
        for word in range(len(chunck)):
            # get the set of words
            # note: the bare access below only inserts the key if self.tokens is a defaultdict
            if chunck[word].lower() not in self.tokens:
                self.tokens[chunck[word].lower()]
            wordDict[chunck[word]] = self.getNameTag(chunck[word], word)
        self.chunckDict.append(wordDict)

def read(origin_file, freq_file, lang):
    freq_dict = defaultdict(int)
    i = 0
    for line in open(origin_file):
        i += 1
        if i % 100000 == 0:
            print i
        items = line.strip().split(',', 3)
        if len(items) == 4 and items[0] == lang:
            # text = items[3].lower().decode('utf-8')
            text = items[3].decode('utf-8')
            text = re.sub(filter_pattern, '', text)
            for sent in split_multi(text):
                for word in word_tokenizer(sent):
                    freq_dict[word] += 1
    save(freq_file, freq_dict)

def read_sentences_from_file(path_to_file, one_sentence_per_line=True):
    lines = []
    with io.open(path_to_file, mode="r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line != "":
                lines.append(line.strip())
    if one_sentence_per_line:
        sentences = lines
    else:
        text = " ".join(lines)
        sentences = list(split_multi(text))
        sentences = [sentence for sentence in sentences if sentence != ""]
    return sentences

def tokenize_on_date(output_file, date='2015-07-05'):
    client = MongoClient()
    texts = client['crawler']['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find({'date': date}):
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                if re.search("'s$", token):
                    f.write('%s\t%s\n' % (token[:-2].encode('utf-8', 'ignore'), 'X'))
                    f.write('%s\t%s\n' % (token[-2:].encode('utf-8', 'ignore'), 'X'))
                else:
                    f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()

def tokenize(self, tweets):
    """
    tokenize the text and filter the @username (and punctuation, smiley ...), leave only words
    """
    counts = []  # [5, 12, 0, 3, ...] the counts of valid words for each tweet
    words = []   # list of words
    # out = ''   # one-word-per-line string of the tokenized words for morph analysis
    for (text, tid, uid) in tweets:
        i = 0
        text = filter_pattern.sub(' ', text)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                # words.append(token.lower().encode('utf-8', 'ignore'))
                words.append(token.encode('utf-8', 'ignore'))
                i += 1
        counts.append(i)
    return words, counts

def tokenize_document(doc_path):
    '''Tokenize the text and preserve offsets'''
    with codecs.open(doc_path, 'r', "utf-8") as myfile:
        doc_text = myfile.read()
    # Split text into sentences using segtok, then words and create Token objects
    sents = [sent for sent in split_multi(doc_text) if sent.strip() != ""]
    doc_tokens = []
    current_offset = 0
    for sent in sents:
        sent_tokens = []
        words = re.split(SPLIT_REGEX, sent)
        words = [word.strip() for word in words if word.strip() != ""]
        for word in words:
            word_offset = doc_text.index(word, current_offset)
            current_offset = word_offset + len(word)
            sent_token = Token(word, word_offset, word_offset + len(word), TOKEN_O)
            sent_tokens.append(sent_token)
        doc_tokens.append(sent_tokens)
    return doc_tokens, doc_text

def __iter__(self):
    for root, _, files in os.walk(self._raw_text_dir):
        for fn in files:
            if fn.endswith('.txt'):
                filename = os.path.join(root, fn)
                f = codecs.open(filename, encoding='utf-8')
                text = f.read()
                if text:
                    text = self.process_text(text)
                    for sent in split_multi(text):
                        # Discard very long and very short sentences
                        if sent and len(sent) < 1000 and len(sent.split()) > 2:
                            sent = sent.strip()
                            yield filename, sent
                f.close()
                # Move the preprocessed files to a temp directory, so we know which files are done.
                done_filename = filename.replace('/raw/', '/raw_preprocessed/')
                if not os.path.exists(os.path.dirname(done_filename)):
                    os.makedirs(os.path.dirname(done_filename))
                os.rename(filename, done_filename)

def Features_computation(self):
    for word in self.tokens:
        Tfa = self.get_word_tags(word, "a")
        TfU = self.get_word_tags(word, "U")
        # compute TCase
        self.termsCalcule[word].TCase = max(Tfa, TfU) / math.log(1 + math.log(self.terms[word].TF))
        # TPos
        self.termsCalcule[word].TPos = math.log(3 + median(self.terms[word].offsets_sentences))
        # TFNorm
        validTFs = [self.terms[term].TF for term in self.tokens if not self.stopWord]
        avgTF = mean(validTFs)
        stdTF = stdev(validTFs)
        self.termsCalcule[word].TFNorm = self.terms[word].TF / (avgTF + stdTF)
        # TSent; split_multi yields sentences lazily, so materialize before taking len()
        self.termsCalcule[word].TSent = self.terms[word].offsets_sentences / len(list(split_multi(self.text)))
        # TRel
        maxTF = max([self.terms[term].TF for term in self.tokens])
        try:
            DL = self.calcule_DL(self.cooccur, self.cooccur(word))[1] / self.calcule_DL(self.cooccur, self.cooccur(word))[0]
        except:
            DL = 0
        try:
            DR = self.calcule_DR(self.cooccur, self.cooccur(word))[1] / self.calcule_DR(self.cooccur, self.cooccur(word))[0]
        except:
            DR = 0
        self.termsCalcule[word].TRel = 1 + (DL + DR) * (self.terms[word].TF / maxTF)

def clean_int(sents):
    introduction = []
    paragraph = []
    for sent in sents:
        sent = sent.lower()
        sent = re.sub('[~|\'|\"|``|\t|\n]', ' ', sent)
        sent = re.sub('{.*}', ' @cite', sent)
        sent = re.sub('\(.*\)', ' @remark', sent)
        sent = re.sub('\[.*\]', ' @cite', sent)
        sent = re.sub('e\s*\.g\s*\.\s*,', ' e.g., ', sent)
        sent = re.sub('e\s*\.g\s*\.\s*', ' e.g., ', sent)
        sent = re.sub('etc\s*\.', ' etc. ', sent)
        sent = re.sub('et\s*al\s*\.\s*,', ' et al., ', sent)
        sent = re.sub('i\s*\.e\s*\.\s*,', ' i.e., ', sent)
        sent = re.sub('[;|:]', '. ', sent)
        sent = re.sub(',[\s|,]*,', ', ', sent)
        sent = re.sub('\s*\.\s*', '. ', sent)
        tmp = split_multi(sent)
        res = []
        for each in tmp:
            if 'i.e' not in each and 'e.g' not in each:
                x = sent_tokenize(each)
                for y in x:
                    y = y.split()
                    if len(y) < 5:
                        continue
                    y = ' '.join(y)
                    res.append(y)
                continue
            each = each.split()
            if len(each) < 5:
                continue
            each = ' '.join(each)
            res.append(each)
        paragraph.append(len(res))
        introduction.extend(res)
    return introduction, paragraph

def compute_term_statistics(self):
    sentencesL = sent_tokenize(self.text)
    sentencesL = list(map(lambda sente: sente.lower(), sentencesL))
    sentences = split_multi(self.text)
    chuncks = self.chunks(sentences)
    for chunck in chuncks:
        for word in range(len(chunck)):
            if chunck[word] not in self.stopWord and len(chunck[word]) > 3:
                # compute the TF of the word
                self.terms[chunck[word]].TF += 1
                # compute the sum of positions of the sentences where the word occurs
                self.terms[chunck[word]].offsets_sentences = self.__getSumIndexSents(sentencesL, chunck[word])
                for j in range(self.window):
                    try:
                        if (chunck[word], chunck[j - word]) not in self.cooccur.keys():
                            self.cooccur[(chunck[word], chunck[j - word])] = 0
                        elif self.occurance(chunck, chunck[word], chunck[j - word]):
                            self.cooccur[(chunck[word], chunck[j - word])] += 1
                    except:
                        pass
                    try:
                        if (chunck[word], chunck[j + word]) not in self.cooccur.keys():
                            self.cooccur[(chunck[word], chunck[j + word])] = 0
                        elif self.occurance(chunck, chunck[word], chunck[j + word]):
                            self.cooccur[(chunck[word], chunck[j + word])] += 1
                    except:
                        pass

def split(self, text: str) -> List[Sentence]:
    sentences = []
    offset = 0

    plain_sentences = split_multi(text)
    for sentence in plain_sentences:
        sentence_offset = text.find(sentence, offset)

        if sentence_offset == -1:
            raise AssertionError(
                f"Can't find offset for sentences {plain_sentences} "
                f"starting from {offset}")

        sentences += [
            Sentence(text=sentence,
                     use_tokenizer=self._tokenizer,
                     start_position=sentence_offset)
        ]

        offset += len(sentence)

    return sentences

def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes text from html element.
    :param cleaned_html_node: html node to extract text sentences
    :param summary_sentences_qty: quantity of sentences of summarized text
    :return: summarized text, two-digit language code
    """
    clean_text = ""
    # assembling text only with complete sentences, ended with respective punctuations.
    for node in cleaned_html_node.iter('p'):
        if node.text is not None:
            for sentence in split_multi(node.text):
                if (len(sentence) > 0 and sentence[-1:] in ['.', '!', '?', '…']
                        and not sentence.strip(' .!?…').isdigit()
                        and not dialog_re.match(sentence)):
                    clean_text = clean_text + ' ' + sentence

    # creating summary, obtaining language code and total sentences quantity
    final_result, lang_code, sent_qty = create_referat(clean_text, '', summary_sentences_qty)
    return final_result, lang_code

def textrank(text, hdr):
    # find the most likely language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    # tokenizing for words
    sentences = [sentence for sentence in split_multi(text)]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code

def split(self, text: str) -> List[Sentence]:
    plain_sentences: List[str] = split_multi(text)
    sentence_offset = 0

    sentences: List[Sentence] = []
    for sentence in plain_sentences:
        try:
            sentence_offset = text.index(sentence, sentence_offset)
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(sentence)} "
                f"starting from position {repr(sentence_offset)}"
            ) from error
        sentences.append(
            Sentence(
                text=sentence,
                use_tokenizer=self._tokenizer,
                start_position=sentence_offset,
            ))
        sentence_offset += len(sentence)

    return sentences

def segment(text):
    ''' Split text into sentences using SegTok segmenter. '''
    return split_multi(text)

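A usage note (not from the original sources): in the segtok versions these examples target, split_multi returns a lazy generator rather than a list, so a caller of the segment() wrapper above that needs len() or indexing should materialize the result first, for example:

# Hypothetical caller of the segment() wrapper above; list() materializes the
# generator returned by split_multi so len() and indexing work as expected.
sentences = list(segment("First sentence. Second sentence."))
print(len(sentences), sentences[0])
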
def test_multi_spans(self):
    self.assertSequenceEqual(SPAN_TEST_ANSWER, list(split_multi(SPAN_TEST_TEXT)))

def test_multiline(self):
    text = "This is a\nmultiline sentence. And this is Mr.\nAbbrevation."
    ml_sentences = ["This is a\nmultiline sentence.", "And this is Mr.\nAbbrevation."]
    self.assertSequenceEqual(ml_sentences, list(split_multi(text)))

def sentence_tokenizer(self, string):
    tokenized_sentences = list(split_multi(string))
    return tokenized_sentences

from segtok.segmenter import split_multi

text = open("abbrev._text", "r").read()
sentences = split_multi(text)

import results
results.print_results(sentences)

# for i in sentences:
#     print((i,))

def _build(self, text, windowsSize, n):
    text = self.pre_filter(text)
    self.sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences = len(self.sentences_str)
    pos_text = 0
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(self.sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    block_of_word_obj = []
            else:
                tag = self.getTag(word, pos_sent)
                term_obj = self.getTerm(word)
                term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                pos_text += 1

                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - windowsSize), len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.addCooccur(block_of_word_obj[w][2], term_obj)

                # Generate candidate keyphrase list
                candidate = [(tag, word, term_obj)]
                cand = composed_word(candidate)
                self.addOrUpdateComposedWord(cand)
                word_windows = list(
                    range(max(0, len(block_of_word_obj) - (n - 1)), len(block_of_word_obj)))[::-1]
                for w in word_windows:
                    candidate.append(block_of_word_obj[w])
                    self.freq_ns[len(candidate)] += 1.0
                    cand = composed_word(candidate[::-1])
                    self.addOrUpdateComposedWord(cand)

                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        self.sentences_obj.append(sentence_obj_aux)

    self.number_of_words = pos_text
