Example #1
    def sentence_tokenizer(self, untokenized_string, language):
        """Reads language .pickle for right language"""
        if language == 'greek':
            pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', ';')
            language_punkt_vars.internal_punctuation = (',', '·')
        elif language == 'latin':
            pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', '?', ':')
            language_punkt_vars.internal_punctuation = (',', ';')
        else:
            raise ValueError("No sentence tokenizer for this language available.")

        with open(pickle_path, 'rb') as open_pickle:
            tokenizer = pickle.load(open_pickle)
        tokenizer.INCLUDE_ALL_COLLOCS = True
        tokenizer.INCLUDE_ABBREV_COLLOCS = True
        params = tokenizer.get_params()
        sbd = PunktSentenceTokenizer(params)
        tokenized_sentences = []
        for sentence in sbd.sentences_from_text(untokenized_string,
                                                realign_boundaries=True):
            tokenized_sentences.append(sentence)
        return tokenized_sentences
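Example #1 assumes a pre-trained Punkt pickle under ~/cltk_data/. For readers without that file, here is a minimal, self-contained sketch of the same train -> get_params -> PunktSentenceTokenizer pipeline; the GreekLanguageVars subclass and the toy training text are illustrative assumptions, not CLTK code, and the language-specific sentence-end characters are passed via lang_vars instead of mutating the PunktLanguageVars class in place.

from nltk.tokenize.punkt import (PunktLanguageVars, PunktSentenceTokenizer,
                                 PunktTrainer)


class GreekLanguageVars(PunktLanguageVars):
    # Greek uses ';' as its question mark, which is why Example #1 adds it
    # to the sentence-end characters.
    sent_end_chars = ('.', ';')


trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train("A few training sentences. They teach Punkt about collocations.")

tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                   lang_vars=GreekLanguageVars())
print(tokenizer.sentences_from_text("First sentence. Second sentence.",
                                    realign_boundaries=True))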
Example #2
def featureize(F, observation_files):

    word_tokenizer = PunktSentenceTokenizer()
    sent_tokenizer = PunktSentenceTokenizer()

    m = len(observation_files)

    # X is Nx2
    X = np.zeros((m, 2), dtype=np.float)

    for (i, filename) in enumerate(observation_files, start=0):

        file_text = read_file(filename).decode('string_escape')

        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2

        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())

        # Return two features:
        # 1 (0) - Number of sentences per file
        # 2 (1) - Number of tokens per file
        X[i][0] = num_sents
        X[i][1] = num_tokens

    return X
Example #3
def featureize(F, observation_files):

    word_tokenizer = PunktSentenceTokenizer()
    sent_tokenizer = PunktSentenceTokenizer()

    m = len(observation_files)

    # X is Nx2
    X = np.zeros((m,2), dtype=np.float)

    for (i,filename) in enumerate(observation_files,start=0):

        file_text  = read_file(filename).decode('string_escape')

        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2

        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())

        # Return two features: 
        # 1 (0) - Number of sentences per file
        # 2 (1) - Number of tokens per file
        X[i][0] = num_sents
        X[i][1] = num_tokens

    return X
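Examples #2 and #3 compute the same two per-file features: the Punkt sentence count and the whitespace-token count. Since read_file() and the unused F argument belong to the surrounding project, here is a rough, self-contained version of the same computation on in-memory strings.

import numpy as np
from nltk.tokenize.punkt import PunktSentenceTokenizer


def simple_features(texts):
    sent_tokenizer = PunktSentenceTokenizer()
    X = np.zeros((len(texts), 2), dtype=float)
    for i, text in enumerate(texts):
        X[i, 0] = len(sent_tokenizer.sentences_from_text(text))  # sentences
        X[i, 1] = len(text.split())                              # tokens
    return X


print(simple_features(["One sentence. And a second one.", "Just one here."]))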
Example #4
 def tokenize(self):
     """
     Returns a list of tokenized sentences
     """
     sentence_tokenizer = PunktSentenceTokenizer()
     sentences = sentence_tokenizer.sentences_from_text(self.text)
     sentences = [sentence.split() for sentence in sentences]
     sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
     return sentences
Example #5
 def tokenize(self):
     '''
     Returns a list of tokenized sentences
     '''
     sentence_tokenizer = PunktSentenceTokenizer()
     sentences = sentence_tokenizer.sentences_from_text(self.text)
     sentences = [sentence.split() for sentence in sentences]
     sentences = [[word.strip(",.?!") for word in sentence]
                  for sentence in sentences]
     return sentences
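Examples #4 and #5 are the same method. As a standalone sketch of what it does (Punkt sentence split, whitespace word split, then stripping surrounding punctuation from each word) on a made-up string:

from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "Hello, world. How are you?"
sentences = PunktSentenceTokenizer().sentences_from_text(text)
sentences = [[word.strip(",.?!") for word in sentence.split()]
             for sentence in sentences]
print(sentences)  # [['Hello', 'world'], ['How', 'are', 'you']]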
Example #6
 def _iter_text_data(self):
     pst = PunktSentenceTokenizer()
     for fragment in self.fragments:
         text = (fragment.text or '').strip()
         if not text:
             continue
         label = fragment.label
         sentences = pst.sentences_from_text(fragment.text)
         for sentence in sentences:
             yield sentence.encode('utf-8'), label
Example #7
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)    
    tokens = []
    for sentence in sentences:
        #sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    for ii in xrange(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
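A rough equivalent of preprocess_doc() without the project-specific neg_scope() step, for readers who want something runnable; it needs NLTK's stopwords corpus (nltk.download('stopwords')).

from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer


def simple_preprocess(doc):
    stop = set(stopwords.words("english"))
    tokens = []
    for sentence in PunktSentenceTokenizer().sentences_from_text(doc):
        tokens.extend(w for w in sentence.split() if w.lower() not in stop)
    # Drop a trailing period, as the loop at the end of Example #7 does.
    return [t[:-1] if t.endswith('.') else t for t in tokens]


print(simple_preprocess("The cat sat on the mat. It purred."))
# ['cat', 'sat', 'mat', 'purred']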
Example #8
class GCBlockExtractor(ExtractionMapper):
    def __init__(self):
        super(GCBlockExtractor,
              self).__init__(extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks
Example #9
class GCBlockExtractor(ExtractionMapper):

    def __init__(self):
        super(GCBlockExtractor, self).__init__(
            extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks
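GCBlockExtractor (Examples #8 and #9) reduces a page to a list of per-sentence character lengths ("blocks"). As a hedged illustration of what such blocks can feed into, the sketch below builds blocks for two short made-up texts and aligns them with NLTK's nltk.translate.gale_church.align_blocks; the ExtractionMapper machinery itself is not reproduced.

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.translate import gale_church

tokenizer = PunktSentenceTokenizer()


def blocks_from_text(text):
    # Character length of each sentence, as in _blocks_from_text above.
    return [len(s)
            for s in tokenizer.sentences_from_text(text.replace('\n', ''))
            if s.strip()]


source = "The weather is nice today. We should go outside."
target = "Das Wetter ist heute schoen. Wir sollten nach draussen gehen."

# align_blocks pairs up the two sentence-length sequences statistically.
print(gale_church.align_blocks(blocks_from_text(source),
                               blocks_from_text(target)))
# e.g. [(0, 0), (1, 1)]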
Example #10
def raw_records(crawl_collection,start):

  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()

  for rec in crawl_collection.query({'downloaded': True},field=None,skip=start):
    _id     = rec['_id']
    if rec['content'] is None:
      continue
    content = rec['content']['contents']
    # A wiki page may probably comprise of multiple content
    for c in content:
      # Explode a long topic into list of sentences
      sentences = pst.sentences_from_text(c)
      for s in sentences:
        yield (_id,s)
Example #11
def tokenize_sents_latin(sentences_string):
    """Tokenize a Latin string into sentences."""
    global tokenenized_sentences
    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Example #12
def add_sents(invid=None):
    if invid:
        findObj = {"_id": invid}
    else:
        findObj = {}
    for vd in vdigests.find(findObj):
        if not vd.get("nSentences") and vd.get('alignTrans') and vd.get(
                'alignTrans').get('words'):
            twords = vd['alignTrans']['words']
            twords_len = len(twords)
            trans = " ".join([wrd["word"] for wrd in twords])
            STokenizer = PunktSentenceTokenizer()
            token_sents = STokenizer.sentences_from_text(trans)
            cwct = 0
            sentct = 0
            curword = twords[cwct]
            for tsent in token_sents:
                tswords = tsent.split(" ")
                for wnum, tsword in enumerate(tswords):
                    if tsword == curword["word"]:
                        curword["sentenceNumber"] = sentct
                        cwct += 1
                        if cwct < twords_len:
                            curword = twords[cwct]
                    else:
                        print "warning: not a one-to-one match: ", curword[
                            "word"], tsword
                        if wnum == 0:
                            curword["sentenceNumber"] = sentct - 1
                            cwct += 1
                            if cwct < twords_len:
                                curword = twords[cwct]
                        elif wnum == len(tswords) - 1:
                            curword["sentenceNumber"] = sentct
                        else:
                            ipdb.set_trace()
                sentct += 1
            vd['nSentences'] = len(token_sents)
            # write the separated sentences to file
            ssout_name = "ss-" + vd["_id"]
            outf = open("../ffdata/rawtrans/" + ssout_name, 'w')
            outf.write("\n".join(token_sents))
            outf.close()
            vd['sentSepTransName'] = ssout_name
            vdigests.save(vd)
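The core idea of add_sents() (Example #12; Example #14 is the same function) is to tokenize a transcript into sentences and then walk the word list, recording which sentence each word falls in. A stripped-down sketch of just that bookkeeping on a made-up transcript, with the MongoDB and file-output plumbing omitted:

from nltk.tokenize.punkt import PunktSentenceTokenizer

transcript = "hello there everyone. welcome to the talk."
words = transcript.split(" ")

tokenizer = PunktSentenceTokenizer()
sentence_of_word = []
w = 0
for sent_no, sent in enumerate(tokenizer.sentences_from_text(transcript)):
    for _ in sent.split(" "):
        sentence_of_word.append((words[w], sent_no))
        w += 1

print(sentence_of_word)
# [('hello', 0), ('there', 0), ('everyone.', 0), ('welcome', 1), ('to', 1),
#  ('the', 1), ('talk.', 1)]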
Example #13
def raw_records(crawl_collection, start):

    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()

    for rec in crawl_collection.query({'downloaded': True},
                                      field=None,
                                      skip=start):
        _id = rec['_id']
        if rec['content'] is None:
            continue
        content = rec['content']['contents']
        # A wiki page may probably comprise of multiple content
        for c in content:
            # Explode a long topic into list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                yield (_id, s)
Example #14
def add_sents(invid=None):
    if invid:
        findObj = {"_id": invid}
    else:
        findObj = {}
    for vd in vdigests.find(findObj):
        if not vd.get("nSentences") and vd.get('alignTrans') and vd.get('alignTrans').get('words'):
            twords = vd['alignTrans']['words']
            twords_len = len(twords)
            trans = " ".join([wrd["word"] for wrd in twords])
            STokenizer = PunktSentenceTokenizer()
            token_sents = STokenizer.sentences_from_text(trans)
            cwct = 0
            sentct = 0
            curword = twords[cwct]
            for tsent in token_sents:
                tswords = tsent.split(" ")
                for wnum, tsword in enumerate(tswords):
                    if tsword == curword["word"]:
                        curword["sentenceNumber"] = sentct
                        cwct += 1
                        if cwct < twords_len:
                            curword = twords[cwct]
                    else:
                        print "warning: not a one-to-one match: ", curword["word"], tsword
                        if wnum == 0:
                            curword["sentenceNumber"] = sentct - 1
                            cwct += 1
                            if cwct < twords_len:
                                curword = twords[cwct]
                        elif wnum == len(tswords) - 1:
                            curword["sentenceNumber"] = sentct
                        else:
                            ipdb.set_trace()
                sentct += 1
            vd['nSentences'] = len(token_sents)
            # write the separated sentences to file
            ssout_name = "ss-" + vd["_id"]
            outf = open("../ffdata/rawtrans/" + ssout_name, 'w')
            outf.write("\n".join(token_sents))
            outf.close()
            vd['sentSepTransName'] = ssout_name
            vdigests.save(vd)
Example #15
def tokenize_greek_sentences(sentences_string):
    global tokenenized_sentences
    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    '''
    with open(input_file) as f:
        to_be_tokenized = f.read()
    '''
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Example #16
def chunk_article(article):
    """
    Given a long string, article, representing the full text of a given article,
    convert the string into a list of sentences
    :param article: A string representing the full text of an article
    :return: A list of strings representing the sentences of the article
    """

    # Add support to NOT falsely split a sentence at a title like dr or mr
    p_params = PunktParameters()
    p_params.abbrev_types = set(ABBREV_TYPES)
    p = PunktSentenceTokenizer(p_params)

    sen = p.sentences_from_text(article, realign_boundaries=False)

    # Strip extra spaces (str.strip returns new strings, so collect the results)
    sen = [s.strip() for s in sen]

    return sen
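Example #16 relies on a project-level ABBREV_TYPES constant. As a self-contained illustration of how Punkt abbreviation parameters prevent false splits at titles (the abbreviation set below is a made-up stand-in, written lowercase and without the trailing period, as Punkt expects):

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

p_params = PunktParameters()
p_params.abbrev_types = {'dr', 'mr', 'mrs', 'prof'}
p = PunktSentenceTokenizer(p_params)

text = "Dr. Smith met Mr. Jones. They talked for an hour."
print(p.sentences_from_text(text, realign_boundaries=False))
# ['Dr. Smith met Mr. Jones.', 'They talked for an hour.']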
Example #17
def export_crawl_to_text(mineDB):

    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()

    text_path = os.path.realpath('./mine.txt')

    with codecs.open(text_path, 'w', 'utf-8') as f:
        m = 0
        for wiki in mineDB.query({'downloaded': True}, field=None):

            # Skip empty content or the added one
            if wiki['content'] is None or 'added_to_graph' in wiki:
                continue

            content = wiki['content']

            # A wiki page may probably comprise of multiple content
            for c in content['contents']:
                # Explode content into sentences
                sentences = pst.sentences_from_text(c)
                print('... content #{} ==> {} sentences extracted.'.format(
                    m, len(sentences)))

                for s in sentences:
                    # Cleanse the sentence
                    s_ = cleanse(s)
                    # Filter out noise by length
                    if len(s_) < 5 or len(s_.split(' ')) < 3:
                        continue
                    f.write(s_.lower() + '\n')

            m += 1

            if m >= args['limit']:
                print(
                    colored('[Ending] Maximum number of topics reached.',
                            'yellow'))
                break

    return text_path
Example #18
def iter_topic(crawl_collection, start):

    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()

    n = 0

    for wiki in crawl_collection.query({'downloaded': True},
                                       field=None,
                                       skip=start):

        # Skip empty content or the added one
        if wiki['content'] is None or 'added_to_graph' in wiki:
            continue

        m = 0
        content = wiki['content']

        if args['verbose']:
            print(colored('[Extracting wiki] : ', 'cyan'), content['title'])

        # A wiki page may probably comprise of multiple content
        for c in content['contents']:
            # Explode a long topic into list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                m += 1
                yield (content['title'], s.split(' '))

        # After all sentences are processed,
        # mark the current wiki record as 'processed'
        crit = {'_id': wiki['_id']}
        crawl_collection.update(crit, {'$set': {'added_to_graph': True}})

        n += 1
        if args['verbose']:
            print(content['title'] + " processed with {0} nodes.".format(m))
            print(
                colored("{0} wiki documents processed so far...".format(n),
                        'blue'))
Example #19
class GaleChurchAlignmentDistance(DistanceScorer):
    def __init__(self):
        self.name = "Gale Church Alignment Scorer"
        self.tokenizer = PunktSentenceTokenizer()
        self.sblocks, self.tblocks = [], []

    def _blocks_from_text(self, text):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                text.replace('\n', '')):
            blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.sblocks.append(self._blocks_from_text(page.text))
        for url, page in target_corpus.iteritems():
            self.tblocks.append(self._blocks_from_text(page.text))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return self.gc.align_score(self.sblocks[s_idx], self.tblocks[t_idx])
Example #20
def export_crawl_to_text(mineDB):
  
  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()

  text_path = os.path.realpath('./mine.txt')

  with codecs.open(text_path, 'w', 'utf-8') as f:
    m = 0
    for wiki in mineDB.query({'downloaded': True},field=None):
      
      # Skip empty content or the added one
      if wiki['content'] is None or 'added_to_graph' in wiki:
        continue

      content = wiki['content']

      # A wiki page may probably comprise of multiple content
      for c in content['contents']:
        # Explode content into sentences
        sentences = pst.sentences_from_text(c)
        print('... content #{} ==> {} sentences extracted.'.format(m, len(sentences)))

        for s in sentences:
          # Cleanse the sentence
          s_ = cleanse(s)
          # Filter out noise by length
          if len(s_)<5 or len(s_.split(' '))<3:
            continue
          f.write(s_.lower() + '\n')

      m += 1

      if m>=args['limit']:
        print(colored('[Ending] Maximum number of topics reached.','yellow'))
        break

  return text_path
Example #21
class GaleChurchAlignmentDistance(DistanceScorer):

    def __init__(self):
        self.name = "Gale Church Alignment Scorer"
        self.tokenizer = PunktSentenceTokenizer()
        self.sblocks, self.tblocks = [], []

    def _blocks_from_text(self, text):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                text.replace('\n', '')):
            blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.sblocks.append(self._blocks_from_text(page.text))
        for url, page in target_corpus.iteritems():
            self.tblocks.append(self._blocks_from_text(page.text))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return self.gc.align_score(self.sblocks[s_idx], self.tblocks[t_idx])
Example #22
def iter_topic(crawl_collection,start):
  
  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()
  
  n = 0
  
  for wiki in crawl_collection.query({'downloaded': True},field=None,skip=start):
    
    # Skip empty content or the added one
    if wiki['content'] is None or 'added_to_graph' in wiki:
      continue

    m = 0
    content = wiki['content']
    
    if args['verbose']:
      print(colored('[Extracting wiki] : ','cyan'), content['title'])
    
    # A wiki page may probably comprise of multiple content
    for c in content['contents']:
      # Explode a long topic into list of sentences
      sentences = pst.sentences_from_text(c)
      for s in sentences:
        m += 1
        yield (content['title'],s.split(' '))

    # After all sentences are processed,
    # mark the current wiki record as 'processed'
    crit = {'_id': wiki['_id']}
    crawl_collection.update(crit, {'$set':{'added_to_graph':True}})

    n += 1
    if args['verbose']:
      print(content['title'] + " processed with {0} nodes.".format(m))
      print(colored("{0} wiki documents processed so far...".format(n),'blue'))
Example #23
class SimhashDistance(DistanceScorer):
    CHAR, TOKEN = range(2)

    def __init__(self, source_tokenizer, target_tokenizer, n=2, level=TOKEN):
        self.name = "Simhash Distance Scorer, n=%d" % n
        self.sentence_splitter = PunktSentenceTokenizer()
        self.s_hashes, self.t_hashes = [], []

        self.source_tokenizer = source_tokenizer
        if not source_tokenizer:
            self.source_tokenizer = SpaceTokenizer()

        self.target_tokenizer = target_tokenizer
        if not target_tokenizer:
            self.target_tokenizer = SpaceTokenizer()

        def ngrams(n, tokenizer, page):
            result = []
            text = page.text.replace('\n', '')
            for sentence in self.sentence_splitter.sentences_from_text(text):
                if not sentence.strip():
                    continue
                # if '\n' in sentence:
                #     print repr(sentence)
                assert '\n' not in sentence, sentence
                words = tokenizer.process(sentence).strip().split()
                result += [
                    " ".join(words[i:i + n])
                    for i in range(max(len(words) - n + 1, 1))
                ]
            return result

        def tokens(n, tokenizer, page):
            # 180/1grams
            # words = page.html.split()
            words = filter(None, re.split("[^0-9a-zA-Z]", page.text))
            return [
                " ".join(words[i:i + n])
                for i in range(max(len(words) - n + 1, 1))
            ]

        def chars(n, tokenizer, page):
            s = "".join(page.text.split())
            return [
                " ".join(s[i:i + n]) for i in range(max(len(s) - n + 1, 1))
            ]

        def html_tokens(n, tokenizer, page):
            # 153/trigrams
            words = page.html.split()
            return [
                " ".join(words[i:i + n])
                for i in range(max(len(words) - n + 1, 1))
            ]

        if level == SimhashDistance.TOKEN:
            self.source_features = partial(tokens, n, self.source_tokenizer)
            self.target_features = partial(tokens, n, self.target_tokenizer)
        elif level == SimhashDistance.CHAR:
            self.source_features = partial(chars, n, self.source_tokenizer)
            self.target_features = partial(chars, n, self.target_tokenizer)
        # self.source_features = partial(ngrams, n, self.source_tokenizer)
        # self.target_features = partial(ngrams, n, self.target_tokenizer)
        # print self.source_features("How are you?\nI am fine. Thanks.")

    def _words_from_text(self, text, tokenizer):
        words = set()
        for line in self.sentence_splitter.sentences_from_text(text):
            for w in tokenizer.process(line).split("\n"):
                words.add(w)
        return words

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.s_hashes.append(Simhash(self.source_features(page)))
        for url, page in target_corpus.iteritems():
            self.t_hashes.append(Simhash(self.target_features(page)))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return -self.s_hashes[s_idx].distance(self.t_hashes[t_idx])

    def get_features(self, text):
        width = 3
        text = self.sentence_splitter.sentences_from_text(text)
        return [
            text[i:i + width] for i in range(max(len(text) - width + 1, 1))
        ]
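The nested tokens()/chars() helpers above build overlapping n-gram "shingles" that are hashed by Simhash. A tiny standalone illustration of the word-level shingling (the Simhash class comes from the surrounding project and is not shown):

import re


def word_ngrams(text, n=2):
    # Same token pattern as tokens() above: keep alphanumeric runs only.
    words = [w for w in re.split("[^0-9a-zA-Z]", text) if w]
    return [" ".join(words[i:i + n])
            for i in range(max(len(words) - n + 1, 1))]


print(word_ngrams("How are you? I am fine.", n=2))
# ['How are', 'are you', 'you I', 'I am', 'am fine']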
Example #24
class SimhashDistance(DistanceScorer):
    CHAR, TOKEN = range(2)

    def __init__(self, source_tokenizer, target_tokenizer, n=2, level=TOKEN):
        self.name = "Simhash Distance Scorer, n=%d" % n
        self.sentence_splitter = PunktSentenceTokenizer()
        self.s_hashes, self.t_hashes = [], []

        self.source_tokenizer = source_tokenizer
        if not source_tokenizer:
            self.source_tokenizer = SpaceTokenizer()

        self.target_tokenizer = target_tokenizer
        if not target_tokenizer:
            self.target_tokenizer = SpaceTokenizer()

        def ngrams(n, tokenizer, page):
            result = []
            text = page.text.replace('\n', '')
            for sentence in self.sentence_splitter.sentences_from_text(text):
                if not sentence.strip():
                    continue
                # if '\n' in sentence:
                #     print repr(sentence)
                assert '\n' not in sentence, sentence
                words = tokenizer.process(sentence).strip().split()
                result += [" ".join(words[i:i + n]) for i in
                           range(max(len(words) - n + 1, 1))]
            return result

        def tokens(n, tokenizer, page):
            # 180/1grams
            # words = page.html.split()
            words = filter(None, re.split("[^0-9a-zA-Z]", page.text))
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        def chars(n, tokenizer, page):
            s = "".join(page.text.split())
            return [" ".join(s[i:i + n]) for i in
                    range(max(len(s) - n + 1, 1))]

        def html_tokens(n, tokenizer, page):
            # 153/trigrams
            words = page.html.split()
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        if level == SimhashDistance.TOKEN:
            self.source_features = partial(tokens, n, self.source_tokenizer)
            self.target_features = partial(tokens, n, self.target_tokenizer)
        elif level == SimhashDistance.CHAR:
            self.source_features = partial(chars, n, self.source_tokenizer)
            self.target_features = partial(chars, n, self.target_tokenizer)
        # self.source_features = partial(ngrams, n, self.source_tokenizer)
        # self.target_features = partial(ngrams, n, self.target_tokenizer)
        # print self.source_features("How are you?\nI am fine. Thanks.")

    def _words_from_text(self, text, tokenizer):
        words = set()
        for line in self.sentence_splitter.sentences_from_text(text):
            for w in tokenizer.process(line).split("\n"):
                words.add(w)
        return words

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.s_hashes.append(Simhash(self.source_features(page)))
        for url, page in target_corpus.iteritems():
            self.t_hashes.append(Simhash(self.target_features(page)))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return -self.s_hashes[s_idx].distance(self.t_hashes[t_idx])

    def get_features(self, text):
        width = 3
        text = self.sentence_splitter.sentences_from_text(text)
        return [text[i:i + width] for i in
                range(max(len(text) - width + 1, 1))]
Example #25
ref_list = " ".join(ref)

line = sent_detector.tokenize(ref_list.strip())

author_name = []
year_of_pub = []
paper_name = []
journal_name = []

year_found = False
req_idx = 1
for i in Reference:
    line = sent_detector.tokenize(i.strip())
    line2 = sent_detector.sentences_from_text(i.strip())
    References.append(line)
    line3 = [x for x in line if x != "."]
    if len(line3) == 4:
        j = 0
        author_name.append(line3[j])
        year_of_pub.append(line3[j + 1])
        paper_name.append(line3[j + 2])
        journal_name.append(line3[j + 3])
    else:
        name_str = []
        regex = re.compile(r"(\d{4})")
        idx = 0
        req_idx = 1
        while idx < len(line3):
            result = re.findall(regex, line3[idx])
Example #26
class kbTokenizer:
    '''Tokenizer used to pre-process KB dataset for generating Word2Vec models
    from word2vecModels/*.w2v. '''
    def __init__(self, bLowerCase=True):
        self.bLowerCase = bLowerCase

        self.oPunktSentTokenizer = PunktSentenceTokenizer()

        self.sNonTokenChars = (u"[‘’“”…”’“–«»\,‘\]\[;:\-\"'\?!¡¢∞§¶•ª≠∑´®†¨^π"
                               "ƒ©˙∆˚¬≈√∫~⁄™‹›fifl‡°·±—‚„‰∏”`◊ˆ~¯˘¿÷\*\(\)<>="
                               "\+#^\\\/_]+")
        self.reNonTokenChars_start = \
            re.compile(u"(\A|\s)%s" % self.sNonTokenChars, re.U)
        self.reNonTokenChars_end = \
            re.compile(u"%s(\.?(\s|\Z))" % self.sNonTokenChars, re.U)
        self.reWhitespace = re.compile("\W+", re.U)

    def removeNonTokenChars(self, sString):
        sString = re.sub(self.reNonTokenChars_start, '\g<1>', sString)
        return re.sub(self.reNonTokenChars_end, '\g<1>', sString)

    def tokenizeSentence(self, sString):
        aTokens = None
        if self.bLowerCase:
            aTokens = self.reWhitespace.split(
                self.removeNonTokenChars(sString.lower()))
        else:
            aTokens = self.reWhitespace.split(
                self.removeNonTokenChars(sString))

        # split() gives empty first/last elements if there were separators at
        # the start/end of the string (so whitespace, in this case).
        # We correct for that.
        iStart = 1 if aTokens[0] == '' else 0
        if aTokens[-1] == '':
            return aTokens[iStart:-1]
        else:
            return aTokens[iStart:]

    def tokenizeText(self, sText):
        '''
        Input is a utf8 text.
        Output is a list of lists of tokens. One list of tokens per sentence.
        '''
        aTextTokens = []
        for sSentence in self.oPunktSentTokenizer.sentences_from_text(sText):
            aTokens = self.tokenizeSentence(sSentence)

            if len(aTokens) > 0:
                aTextTokens.append(aTokens)

        return aTextTokens

    def tokenizeFile(self, sFile):
        try:
            fhInput = codecs.open(sFile, mode='r', encoding='utf8')
        except IOError, oError:
            print >> sys.stderr, "[ERROR] Error while opening '%s'" % sFile
            print >> sys.stderr, "[ERROR] '%s'" % oError
            exit(1)

        sText = fhInput.read()
        fhInput.close()

        return self.tokenizeText(sText)
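Hypothetical usage of the kbTokenizer class above (assuming its module, which is Python 2 code, has been imported): tokenizeText() returns one list of lowercased tokens per sentence.

tok = kbTokenizer()
print(tok.tokenizeText(u"Hello, world! This is a test."))
# e.g. [['hello', 'world'], ['this', 'is', 'a', 'test']]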
Example #27
def annotate_text(raw_data_folder,
                  labels_data_folder,
                  file_to_write,
                  max_sent_len=35,
                  improved_sent_splitting=True,
                  training=True):
    """
    Creates a token-level input file for the span identification task and adds
    sentence IDs to the tokens.
    """
    # max_sent_len = -1 ==> no sentence splitting
    if max_sent_len == -1:
        # the corresponding if-block can handle this
        improved_sent_splitting = True
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    if improved_sent_splitting:
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set([
            'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'ms', 'rep', 'u.s', 'feb',
            'sen'
        ])
        splitter = PunktSentenceTokenizer(punkt_param)
        splitter.PUNCTUATION = tuple(';:,.!?"')
    output_table = []
    file_counter = 0
    sent_no_total = 0

    print("Total number of files - {}".format(len(
        os.listdir(raw_data_folder))))

    # Reading all the files from the raw text directory
    article_file_names = [
        file_name for file_name in os.listdir(raw_data_folder)
        if file_name.endswith(".txt")
    ]
    article_file_names.sort()

    for file_name in article_file_names:
        if training:
            label_file_name = file_name.replace(".txt", ".task2-TC.labels")
            print("raw_article: {}\tlabel_file: {}".format(
                file_name, label_file_name))

            # Read the labels file with 4 columns of format
            # doc_id : label_of_span : idx_span_begin : idx_span_end
            with open(os.path.join(labels_data_folder, label_file_name),
                      encoding="utf-8") as file:
                rows = file.readlines()
                rows = [
                    row.strip().split("\t") for row in rows
                    if len(row.split("\t")) == 4
                ]

                # Saving mappings char_idx->labels into the dictionary
                char_idx2label = dict()
                for row in rows:
                    label = row[1]
                    idx_from = int(row[2])
                    idx_to = int(row[3])

                    for idx in range(idx_from, idx_to):
                        if idx not in char_idx2label.keys():
                            char_idx2label[idx] = []
                        char_idx2label[idx].append(label)
        else:
            print("raw_article: " + file_name)

        # Read the article and process the text
        with open(os.path.join(raw_data_folder, file_name),
                  encoding="utf-8") as file:
            file_text = file.readlines()
            # Keep linebreaks for better sentence splitting
            file_text = ''.join([line for line in file_text])

            # Normalizing punctuation marks to help the tokenizer.
            file_text = file_text.replace('“', '"').replace('”', '"')
            file_text = file_text.replace("’", "'").replace("‘", "'")

            sentences = []
            if improved_sent_splitting:
                # Line breaks -> helps with headlines
                paragraphs = file_text.split('\n')
                for para in paragraphs:
                    para = para.strip()
                    sentences_raw = splitter.sentences_from_text(para)
                    for sent in sentences_raw:
                        sent = sent.strip()
                        tokens = tokenizer(sent)
                        if len(tokens) <= max_sent_len or max_sent_len == -1:
                            # No need to split the sentence!
                            if len(sent) == 0:
                                # Can happen when paragraphs are separated by
                                # several line breaks.
                                continue
                            sentences.append(sent)
                            continue

                        # Try splitting based on quotes.
                        quote_fragments, all_ok = punct_based_split_sent(
                            tokenizer, sent, max_sent_len, '"')
                        if all_ok:
                            sentences += quote_fragments
                            continue

                        # Other punctuation for splitting: ; :
                        for quote_frag in quote_fragments:
                            semicolon_fragments, all_ok =\
                                punct_based_split_sent(tokenizer, quote_frag,
                                                       max_sent_len, ';')
                            if all_ok:
                                sentences += semicolon_fragments
                                continue

                            for semicolon_frag in semicolon_fragments:
                                colon_fragments, all_ok =\
                                    punct_based_split_sent(tokenizer,
                                                           semicolon_frag,
                                                           max_sent_len, ':')
                                if all_ok:
                                    sentences += colon_fragments
                                    continue

                                # Commas:
                                for col_frag in colon_fragments:
                                    comma_fragments, all_ok =\
                                        punct_based_split_sent(tokenizer,
                                                               col_frag,
                                                               max_sent_len,
                                                               ',')
                                    if all_ok:
                                        sentences += comma_fragments
                                        continue

                                    # Last resort:
                                    # Split after max_sent_len tokens
                                    for comma_frag in comma_fragments:
                                        sentences += forcefully_split_sent(
                                            tokenizer, comma_frag,
                                            max_sent_len)
            else:
                # Cut long sentences into fragments that are (up to)
                # max_sent_len characters long
                # (the last fragment in a sentence might be shorter)
                file_text = file_text.replace('\n', ' ')
                sentences_raw = sent_tokenize(file_text)
                for sent in sentences_raw:
                    sentences += forcefully_split_sent(tokenizer, sent,
                                                       max_sent_len)

            i = 0
            for sent in sentences:
                sent = sent.strip()
                i = file_text.find(sent, i)
                max_idx = i + len(sent)

                if sent == '':
                    continue

                if improved_sent_splitting:
                    if len(sent.strip()) < 2:  # single char noise
                        continue

                sent_no_total += 1
                for token in tokenizer(sent):
                    token = str(token)
                    token_idx = file_text.find(token, i, max_idx)
                    i = token_idx + len(token)
                    output = [
                        file_name.replace("article", "").replace(".txt", ""),
                        str(sent_no_total),
                        str(token_idx),
                        str(i), token
                    ]
                    if training:
                        # Check the label of the corresponding char_idx
                        label = char_idx2label.get(token_idx, ['None'])
                        output.append("|".join(label))
                    output_table.append(output)

        file_counter += 1
        print("Finished {} files\n".format(file_counter))

        with open(file_to_write, 'w', encoding="utf-8") as f:
            f.write('# max_sent_len=' + str(max_sent_len) +
                    ', improved_sent_splitting=' +
                    str(improved_sent_splitting) + '\n')
            f.write('document_id\tsent_id\ttoken_start\ttoken_end\ttoken')
            if training:
                f.write('\tlabel')
            f.write('\n')
            for row in output_table:
                f.write('\t'.join(row) + "\n")
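Example #27's inner loops map each token back to character offsets in the raw article text by advancing a cursor with str.find. A minimal sketch of that bookkeeping, with a plain .split() standing in for the spaCy tokenizer:

text = 'He said "yes". She agreed.'
i = 0
offsets = []
for token in text.split():
    start = text.find(token, i)
    i = start + len(token)
    offsets.append((token, start, i))
print(offsets)
# [('He', 0, 2), ('said', 3, 7), ('"yes".', 8, 14), ('She', 15, 18),
#  ('agreed.', 19, 26)]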