Example #1
class tokenizer:
    def __init__(self):
        self.stanford_tokenizer = StanfordTokenizer(
            '../stanford-parser-2010-08-20/stanford-parser.jar',
            options={"americanize": False})

    # tokenize with the Stanford tokenizer
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    # tokenize with the NLTK word tokenizer; a leading apostrophe on long
    # tokens is split off and re-inserted as a separate token
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        return temp_list

    def no_block(self, string):
        # length of the string after removing all spaces
        string = re.sub(r' ', '', string)
        return len(string)
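A minimal, hypothetical usage sketch for the class above, assuming the module-level imports the example relies on (nltk, re, and nltk.tokenize.stanford.StanfordTokenizer) and that the referenced parser jar exists locally:

# Hypothetical usage of the tokenizer class above; paths and inputs are placeholders.
tok = tokenizer()
print(tok.stanford_tokenize("Don't touch the colour settings."))  # token list; "colour" kept since americanize=False
print(tok.no_block("a b c"))  # 3: length of the string with spaces removed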
Example #2
    def __init__(self, sentence):
        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.status = 0
        self.trans = googletrans.Translator()

        self.sentence = sentence.strip("\n").replace(" ", "")

        en_trans = self.trans.translate(sentence).text
        en_trans = sg.tokenize(en_trans)
        try:
            tree = list(en_parser.parse(en_trans))
            self.tree = tree[0]
            # print(self.tree)
            self.rel = []
        except Exception:
            # parsing failed; flag it so callers can skip this sentence
            self.status = 1
Example #4
def clean_content(page, verbose=True):
    content = page.content
    tag_name = ''
    ret_content = []
    for line in content.splitlines():
        match = re.match('=+ +(.+)? +=+', line)
        # Is this a section heading? If not, keep the line unless the current
        # section is one that should be omitted.
        if not match:
            if len(line) and tag_name not in clean_content.omit_sections:
                ret_content.append(line)
            continue
        # Update the current section name, dropping a trailing "Edit" suffix
        tag_name = match.group(1)
        match = re.match('(.+)?Edit', tag_name)
        if match: tag_name = match.group(1)
    # Keep sentences that start with a capital letter and end in . ! or ?
    st = StanfordTokenizer()
    ret_tokens = []
    for idx, line in enumerate(ret_content):
        if verbose:
            sys.stdout.write('\rParsing "%s" %d / %d'%(page.title, idx+1, len(ret_content)))
            sys.stdout.flush()
        tokens = st.tokenize(line)
        indices = [0]+[i+1 for i, e in enumerate(tokens) if e in ['.','!','?']]
        subtokens = [tokens[indices[i]:indices[i+1]] for i in range(len(indices)-1)]
        ret_tokens.extend(filter(lambda tokens: tokens[0][0].isupper(), subtokens))
    if verbose: sys.stdout.write('\n'); sys.stdout.flush()
    return '\n'.join([' '.join(line) for line in ret_tokens])
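A hypothetical call to clean_content, assuming the example's own imports (re, sys, StanfordTokenizer) are in place. The page argument only needs .content and .title attributes (e.g. a wikipedia.WikipediaPage), and omit_sections must be attached to the function beforehand because the body reads it as a function attribute:

# Hypothetical stand-in for the page object; StanfordTokenizer() also needs
# the Stanford jar on the CLASSPATH and a Java runtime.
class Page:
    title = 'Stanford University'
    content = ('== History ==\n'
               'Stanford was founded in 1885. it grew quickly.\n'
               '== References ==\n'
               'Assorted citations.')

clean_content.omit_sections = ['References']  # placeholder section names
print(clean_content(Page(), verbose=False))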
Example #5
def tokenize(content):
    """Breaks up text-based content into tokens in the style of PTB corpus"""
    _path_to_jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    token_list = []
    st = StanfordTokenizer(path_to_jar=_path_to_jar)
    content = content.lower()
    token_list = st.tokenize(content)
    return token_list
def tokenize_stopwords_stemmer(texts):

    # Use this block for Stanford tokenization; it is not needed when using
    # an ordinary tokenizer.
    # tokenize
    Str_texts = texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    # path_to_jar locates the Stanford parser jar; the r prefix keeps
    # sequences such as '\t' in the path from being treated as escapes.
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string

    p1 = r'[-@<#$%^&*].+'
    # re.compile() turns the pattern string into a Pattern object, which is
    # then matched against each token.
    pa1 = re.compile(p1)
    texts_filtered0 = [document for document in texts_tokenized
                       if document not in pa1.findall(document)]

    # r'.+[-_\/].+' (instead of r'.+[-_\./].+') keeps periods inside numbers,
    # so version strings like 3.1.2 survive.
    p2 = r'.+[-_\/].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            # split the token on the first separator found and keep the parts
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)

    # drop empty strings and stray quote tokens
    texts_filtered = [document for document in texts_filtered
                      if document != '' and document != "''" and document != "``"]

    # stopwords
    english_stopwords = stopwords.words('english')  # load the English stopword list
    texts_filtered_stopwords = [document for document in texts_filtered
                                if document not in english_stopwords]  # drop stopwords

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`',
                            '$', '^', '/*', '*/', '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?']  # punctuation tokens

    texts_filtered = [document for document in texts_filtered_stopwords
                      if document not in english_punctuations]  # drop punctuation

    # remove any token that contains the substring 'comment'
    temp = texts_filtered[:]
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    #texts_filtered=[re.sub(r'^[1-9]\d*$'.format(punctuation), '', x) for x in texts_filtered]  # ^[1-9]\d*$ would blank out pure integers

    porter = nltk.PorterStemmer()  # Porter stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # stem every token
    return texts_Stemmered  # return a list of stems
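A hypothetical call to tokenize_stopwords_stemmer; the function expects a one-element list whose first item is the raw document string, and the jar and Java paths hard-coded inside it must exist:

# Hypothetical input document; the output is a list of stemmed tokens.
stems = tokenize_stopwords_stemmer(
    ["The parser_module splits version 3.1.2 tokens quickly."])
print(stems)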
Example #7
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(qa):
        row['question_toked'] = MyTokenizer.tokenize(
            row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa,
                      open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
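A hypothetical invocation of tokenize_q; qa is assumed to be a VQA-style list of dicts with a 'question' field, and phase only names the dumped JSON files:

# Hypothetical data; tokenize_q mutates each record in place and writes
# vqa_train_toked.json (plus periodic checkpoints) next to the script.
qa = [{'question': 'What color is the dog on the left?'}]
tokenize_q(qa, 'train')
print(qa[0]['question_toked'])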
Example #8
class Tokenizer(object):
    """
    Tokenize sentence
    """
    def __init__(self, jar_path):
        self.tokenizer = StanfordTokenizer(jar_path)

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)

    def __call__(self, sentence):
        return self.tokenize(sentence)
def spans(txt):
    english_tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={
            "americanize": True,
        },
        java_options='-mx1000m')
    tokens = english_tokenizer.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
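A hypothetical loop over the spans() generator above, which yields each token together with its character offsets in the original string (the hard-coded postagger jar path has to be adjusted to a real location):

# Hypothetical usage; offsets index into the original text.
for token, start, end in spans("Stanford was founded in 1885."):
    print(token, start, end)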
Example #10
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, article):
        return self.tokenizer.tokenize(article)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""

        if line == "": return line
        if line[-1] in self.END_TOKENS: return line
        # print line[-1]
        return line + " ."

    def adjust_article(self, article):
        # takes the article as a list of lines and returns one cleaned string

        # Lowercase everything
        lines = [line.lower() for line in article]

        # Put periods on the ends of lines that are missing them (this is a problem in the dataset because many image captions don't end in periods; consequently they end up in the body of the article as run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out article and abstract sentences
        article_lines = []

        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                article_lines.append(line)

        # Make article into a single string
        article = ' '.join(article_lines)

        # # Make abstract into a signle string, putting <s> and </s> tags around the sentences
        # abstract = ' '.join(["%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END) for sent in highlights])

        return article
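A hypothetical round trip through the Preprocessor above; article is assumed to be a list of raw lines, matching how adjust_article iterates, and the stanford-postagger.jar path must resolve for tokenize() to work:

# Hypothetical usage with placeholder lines.
pre = Preprocessor()
article = ["A caption with no period", "A normal sentence."]
print(pre.adjust_article(article))  # "a caption with no period . a normal sentence."
print(pre.tokenize("First line.\nSecond line."))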
Example #11
def stanfordTokenizer ( rawText ):
  """
  Uses Stanford University's natural language processing lab
    tokenizer to split raw text.
  """

  jarPath = "/Users/Nathan/nltk_data/stanford-postagger.jar"
  stanfordOptions = {
    "americanize": True,
    "ptb3Escaping": False
  }

  stanfordTokenizer = StanfordTokenizer( jarPath, 'UTF-8', stanfordOptions )

  return stanfordTokenizer.tokenize( rawText )
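A hypothetical call to the helper above; the jar path inside the function is specific to the original author's machine and would need to be changed locally:

# Hypothetical input; returns a list of tokens.
print(stanfordTokenizer("The colour grey isn't standardised here."))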
Example #12
    def __init__(self, sentence):

        en_parser = StanfordParser(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
                                   path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
                                   )
        sg = StanfordTokenizer(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel=[]
Example #13
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""

        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        return line + " ."

    def preprocess_text(self, text):
        """Preprocesses and prepares input text for summarization"""

        # Lowercase everything
        lines = [line.lower() for line in text]
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out text
        text_lines = []

        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                text_lines.append(line)

        # Make text into a single string
        text = ' '.join(text_lines)
        return text
Example #14
class tokenizer(object):
	MY_ID = 'TOKENIZER'
	def __init__(self,mode=None):
		self.config = GetConfig()
		if mode:
			self.mode = mode
		else:
			if self.config.has_option(self.MY_ID,'mode'):
				self.mode = self.config.get(self.MY_ID,'mode')
			else:
				self.mode = 'NLTK'
		if self.mode == 'STANFORD':
			from nltk.tokenize.stanford import StanfordTokenizer as Tokenizer
			self.tokenizer = Tokenizer()
		elif self.mode == 'NLTK':
			pass
		elif self.mode == 'MINE':
			self.spacePunct = re.compile(ur'[`~!@#\$%\^&\*\(\)\[\]{}_\+\-=\|\\:;\"\'<>,\?/]')
			self.removePunct = re.compile(ur'\.')
		else:
			raise Exception('Error: tokenizer, Unknown mode %s!' %(self.mode))

	def tokenize(self, sent):
		if sent.endswith('-') or sent.endswith('~'):
			sent += ' '
		sent = sent.replace('~ ', ' ~ ')
		sent = sent.replace('- ', ' - ')
		if self.mode == 'STANFORD':
			tokens = self.tokenizer.tokenize(sent.strip())
		elif self.mode == 'NLTK':
			tokens = nltk.word_tokenize(sent.strip())
		elif self.mode == 'MINE':
			new_sent = sent.strip()
			new_sent = self.spacePunct.sub(' ', new_sent)
			new_sent = self.removePunct.sub('', new_sent)
			tokens = new_sent.split()
		p_sent = ' '.join(tokens)
		p_sent = p_sent.replace('% ', '%')
		p_sent = p_sent.replace('``', '\"')
		p_sent = p_sent.replace('\'\'', '\"')
		p_tokens = p_sent.split(' ')
		return p_tokens
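A hypothetical instantiation of the tokenizer class above; GetConfig() comes from the surrounding project, so this sketch assumes it is importable, and passing mode explicitly avoids depending on what the config file contains:

# Hypothetical usage; 'NLTK' mode only needs nltk installed, whereas
# 'STANFORD' also needs the Stanford jars on the CLASSPATH.
tok = tokenizer(mode='NLTK')
print(tok.tokenize("It's nearly done - isn't it?"))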
Example #15
    def Tok_handler(self, sentence, parser):
        if parser == "spacy":
            try:
                import spacy, en_core_web_sm
            except ImportError:
                print("Can't import spacy")
            nlp = en_core_web_sm.load()
            doc = nlp(sentence)
            return [str(token) for token in doc]
        elif parser == "nltk":
            try:
                import nltk
                from nltk.tokenize.stanford import StanfordTokenizer

                os.environ["CLASSPATH"] = "./StanfordNLP/jars"
                os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
            except ImportError:
                print("Can't import nltk or the Stanford tokenizer")
            tokenizer = StanfordTokenizer()
            return tokenizer.tokenize(sentence)
Example #16
    def __init__(self, sentence):

        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)  # googletrans exposes the translation via .text

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel = []
Example #17
class FanFictionHPSpellTokenizer(object):

    TOKENIZE_BY_SENTENCE = "sentence"
    TOKENIZE_BY_PARAGRAPH = "paragraph"
    TOKENIZE_AS_TEXT = "text"

    DUMMY_SEPARATOR = "DUMMY_SEPARATOR"

    def __init__(
            self,
            singleword_spells,
            multiword_spells,
            tokenize_by="text",  #tokenize_by="sentence",
            punkt_tokenizer='tokenizers/punkt/english.pickle',
            path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"
    ):

        self.singleword_spells = singleword_spells
        self.multiword_spells = multiword_spells
        self.multiword_spells_joint = [
            "_".join(s.split()) for s in multiword_spells
        ]
        self.tokenize_by = tokenize_by
        self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
        self.sent_detector = nltk.data.load(punkt_tokenizer)

    def tokenize(self, path):
        """
        Tokenize one texts at a time is slow
        """
        with codecs.open(path, encoding="utf-8") as f_fanfiction:
            fanfiction_story = f_fanfiction.read().lower()

        if self.tokenize_by == self.TOKENIZE_BY_SENTENCE:
            return self._tokenize_by_sentence(fanfiction_story)
        elif self.tokenize_by == self.TOKENIZE_BY_PARAGRAPH:
            return self._tokenize_by_paragraph(fanfiction_story)
        elif self.tokenize_by == self.TOKENIZE_AS_TEXT:
            return self._tokenize(fanfiction_story)
        else:
            raise NotImplementedError

    def _tokenize_by_sentence(self, text):

        sentences = self.sent_detector.tokenize(text.strip())

        # join multi-word spells with underscores so the tokenizer keeps them whole
        for i, s in enumerate(sentences):
            for mws in self.multiword_spells:
                if mws in s:
                    s = s.replace(mws, "_".join(mws.split(" ")))
            sentences[i] = s

        # separate sentences with a dummy token, tokenize everything in one
        # call, then split back into per-sentence token lists
        joined_sentences = (" " + self.DUMMY_SEPARATOR + " ").join(sentences)
        new_sentences = " ".join(self.toktok.tokenize(joined_sentences))

        return [
            s.split() for s in new_sentences.split(self.DUMMY_SEPARATOR)
        ]

    def _tokenize(self, text):

        output = []
        sentences = " ".join(self.sent_detector.tokenize(text.strip()))

        for mws in self.multiword_spells:
            if mws in sentences:
                sentences = sentences.replace(mws, "_".join(mws.split(" ")))

        tokens = self.toktok.tokenize(sentences)
        output.append(tokens)

        return output

    def is_spell(self, token):
        """
        Tokens must have been obtained after processing the text with the method 
        tokenize()
        """
        return token in self.multiword_spells_joint or token in self.singleword_spells
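A hypothetical instantiation of the spell tokenizer above; the spell lists, the story path, and the CoreNLP jar location are placeholders:

# Hypothetical usage; 'story.txt' is a placeholder UTF-8 file.
tok = FanFictionHPSpellTokenizer(
    singleword_spells=['lumos', 'accio'],
    multiword_spells=['expecto patronum'],
    tokenize_by='text',
    path_stanford_jar='/opt/stanford-corenlp-3.8.0.jar')
tokens = tok.tokenize('story.txt')
print(tok.is_spell('expecto_patronum'))  # True: multi-word spells are joined with '_'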
Example #18
class SMOCoder:
    def __init__(self):

        # set environment variable
        # TO DO: update to Docker path
        os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')

        # load tokenizer and tagger
        # TO DO: again, update to Docker path
        self.STANFORD_TOKENIZER = StanfordTokenizer(
            resource_filename(__name__, 'tokenizers/stanford-ner-3.6.0.jar'))
        self.SMO_tagger = StanfordNERTagger(
            resource_filename(__name__,
                              'classifiers/ner-orgs_2016-03-28_all.ser.gz'))

    def getSMO(self, text, as_str=False):
        '''
        Extract social movement organizations from text using a custom trained Stanford NER tagger.

        :param text: text to extract social movement organizations from
        :type text: string

        :param as_str: logical indicating whether SMOs should be returned as a string. Defaults to False.
        :type as_str: boolean

        :return: SMOs extracted from text
        :rtype: set, or a string if as_str = True
        '''

        # Tokenize. What to do about <br /> ?
        tokens = self.STANFORD_TOKENIZER.tokenize(text)

        # Run tagging. This returns a list of tuples
        # classified as 'ORGANIZATION' if an SMO, 'O' otherwise
        tags = self.SMO_tagger.tag(tokens)

        current_SMO = ''
        all_SMOs = []

        # Note: Stanford NER tagger tags individual words as SMO or non-SMO.
        # For example, Black Lives Matter will be returned as ('Black', 'ORGANIZATION'), ('Lives', 'ORGANIZATION'), ('Matter', 'ORGANIZATION')
        # We want to parse this to a single organization.
        #
        # Non-perfect solution: assume that all consecutive ORGANIZATION tags represent a single SMO
        for tag in tags:

            # if tagged as organization, add to current SMO and skip ahead
            if 'ORGANIZATION' == tag[1]:

                if '' == current_SMO or "'" == tag[0]:
                    current_SMO = current_SMO + tag[0]
                else:
                    current_SMO = current_SMO + ' ' + tag[0]

                continue

            # adding test for unknown label
            if 'O' != tag[1]:
                print(
                    'Unknown tag ' + tag[1] +
                    ', skipping ahead. Could be worth investigating further.')

            # add last detected organization to list and reset current_SMO
            if '' != current_SMO:
                all_SMOs.append(current_SMO)
                current_SMO = ''

        # get unique elements
        all_SMOs = set(all_SMOs)

        if as_str:
            return '; '.join(all_SMOs)
        else:
            return all_SMOs
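A hypothetical run of the coder above; it assumes the package ships the tokenizer jar and the custom NER model at the resource paths used in __init__:

# Hypothetical usage; the result is a set of organization strings, or a
# ';'-joined string when as_str=True.
coder = SMOCoder()
print(coder.getSMO("Members of Black Lives Matter joined the march.", as_str=True))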
Example #19
    with open('pairs_ronny{}.csv'.format(teststr), 'r+') as f:
        lines_pairs = f.readlines()[1:]
    with open('table_ronny.csv', 'r') as f:
        lines_table = f.readlines()

    # parse table
    entry_table = parse_table(lines_table, stanford)
    with open('dummy_tok.tables.jsonl', 'w') as f:
        json.dump(entry_table, f)

    # parse all pairs
    f = open('dummy_tok{}.jsonl'.format(teststr), 'w+')
    for line in lines_pairs:
        entry = dict(phase=2)
        # line.replace('"', '')
        # line = line.encode()
        line = line.split(',')
        sqlquery = line[0]
        question = line[1]
        entry['query'] = sqlquery
        entry['question'] = question
        entry['sql'] = parse_sql(sqlquery, entry_table['header'])
        entry['query_tok'] = stanford.tokenize(sqlquery)
        entry['question_tok'] = stanford.tokenize(question)
        entry['table_id'] = 'mock_time_machine'
        json.dump(entry, f)
        f.write('\n')

    f.close()
    os.system('cp dummy_tok{}.jsonl ../data/'.format(teststr))
Example #20
CLASSPATH = CLASSPATH + ':' + path_api

print(CLASSPATH)

os.environ["CLASSPATH"] = CLASSPATH
os.environ['STANFORD_PARSER'] = path_corenlp
os.environ['STANFORD_MODELS'] = path_model



sent = "Kalla, it\'s a dog!"

from nltk.tokenize.stanford import StanfordTokenizer

tokenizer = StanfordTokenizer()
print(tokenizer.tokenize(sent))



from nltk.parse.stanford import StanfordParser

class MyParser(StanfordParser):
    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.
        The output format is `wordsAndTags`.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
Example #21
class Values(object):
    def __init__(self, data, path, all_names):
        self.data = data
        self.path = path
        self.english_postagger = StanfordPOSTagger(
            path + 'models/english-left3words-distsim.tagger',
            path + 'lib/stanford-postagger-3.4.1.jar',
            java_options='-Xmx2g')
        self.english_tokenizer = StanfordTokenizer(
            path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
        self.all_names = all_names
        self.pos = self.extract_POS()
        self.nms = self.extract_names()
        self.wg1 = self.extract_wordgrams(1)
        self.wg2 = self.extract_wordgrams(2)
        self.cg1 = self.extract_chargrams(1)
        self.cg2 = self.extract_chargrams(2)
        self.cg3 = self.extract_chargrams(3)
        self.bl = self.extract_breaklines()
        self.ws = self.extract_websites()

    def getVals(self):
        return self.bl, self.wg1, self.wg2, self.ws, self.nms, self.pos, self.cg1, self.cg2, self.cg3

    def extract_POS(self):
        return self.english_postagger.tag(
            self.english_tokenizer.tokenize(self.data))

    def extract_websites(self):
        websites = []
        result = re.findall('href=\"(.*?)\"', self.data)
        for r in result:
            if (r == 'mailto:') or (r == 'http:///'): continue
            else: websites.append(r)
        return websites

    def extract_breaklines(self):
        breaklines = []
        idx_old = 0
        idx_new = self.data.find('<br>')
        breaklines.append(idx_new - idx_old)
        idx_old = idx_new
        while idx_old < len(self.data):
            idx_new = self.data.find('<br>', idx_old + 4)
            if (idx_new == -1): break
            breaklines.append(idx_new - idx_old)
            idx_old = idx_new
        return breaklines

    def extract_chargrams(self, gram_size):
        return [
            ''.join(self.data[i:i + gram_size])
            for i in range(len(self.data) - gram_size + 1)
        ]

    def extract_wordgrams(self, gram_size):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(
            re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split(r'[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = list(filter(None, word_list))  # drop empty strings (list() keeps len() working on Python 3)
        return [
            ''.join(word_list[i:i + gram_size])
            for i in range(len(word_list) - gram_size + 1)
        ]

    def extract_names(self):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(
            re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split('[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = list(filter(None, word_list))  # drop empty strings (list() keeps len() working on Python 3)
        word_list = [x.lower() for x in word_list]
        return list(set(word_list) & set(self.all_names))
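A hypothetical construction of Values; path must contain the Stanford POS tagger model and jar laid out as in __init__, all_names is whatever lowercase name list the caller maintains, and `punctuation` (from the string module) plus a Java runtime are assumed to be available:

# Hypothetical usage with placeholder HTML-ish data.
vals = Values('Hello John, visit <a href="http://example.org">us</a><br>soon',
              '/opt/stanford/', ['john'])
bl, wg1, wg2, ws, nms, pos, cg1, cg2, cg3 = vals.getVals()
print(ws)   # ['http://example.org']
print(nms)  # ['john'] if the name appears in the data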
Example #22
                                 strip_handles=True)

tokenized_datasets_original_tweet = [[
    tweet_tokenizer.tokenize(request) for request in dataset
] for dataset in datasets]

print("Retokenizing with Stanford tokenizer. This may take a long time.")

path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/"
jar_pos = "stanford-postagger.jar"

tokenizer = StanfordTokenizer(path_pos + jar_pos)
tokenizer = StanfordTokenizer(tagger_path)

tokenized_datasets_original = [[
    tokenizer.tokenize(' '.join(request).strip()) for request in dataset
] for dataset in tokenized_datasets_original_tweet]
# tokenized_datasets_original = tokenized_datasets_original_tweet
"""
Convert all tokens to lowercase
"""
tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]
"""
Build the whole vocabulary

Vocab lists:
• special token: "UNK_TOKEN"
• vocab_shared: intersection of word2vec vocab and politeness vocab
• vocab_freq: frequent vocab that is not in word2vec vocab
Example #23
def main(in_path, outpath):
    nltk.download()

    span_extractor = torch.load(os.path.join(EXPERIMENT,
                                             'best_span_extractor.tar'),
                                map_location='cpu')
    answer_verifier = torch.load(os.path.join(EXPERIMENT,
                                              'best_answer_verifier.tar'),
                                 map_location='cpu')
    span_extractor.use_cuda = False
    answer_verifier.use_cuda = False

    tokenizer = StanfordTokenizer(
        options={'ptb3Escaping':
                 True})  # same tokenizer used by lexical parser
    parser = StanfordParser(java_options='-mx5g')

    data = json.load(open(in_path, 'r'))['data']
    batches = []
    official_eval = {}
    official_eval_tokens = {}
    qaid_map = {}

    num_articles = len(data)
    for aidx in range(len(data)):
        article = data[aidx]
        print('\t- Article Count=%d/%d' % (aidx + 1, num_articles))
        for pidx, paragraph in enumerate(article['paragraphs']):
            passage, qas = paragraph['context'], paragraph['qas']
            passage = passage.replace(u'\xa0', ' ')
            sentences = sent_tokenize(passage)

            sentence_tokens = [
                tokenizer.tokenize(sentence) for sentence in sentences
            ]
            raw_trees = [
                list(s)[0] for s in list(
                    parser.parse_sents(sentence_tokens, verbose=True))
            ]
            squad_tree = TreePassage(raw_trees)

            for qidx, qa in enumerate(qas):
                question_sentences = sent_tokenize(qa['question'])
                question_tokens = []
                for s in question_sentences:
                    question_tokens += tokenizer.tokenize(s)

                batches.append(
                    Batch([{
                        'apid': 'apid',
                        'qa_id': qa['id'],
                        'context_squad_tree': squad_tree,
                        'question_tokens': question_tokens,
                        'answers': [],
                        'is_impossible': 0
                    }], False))

                qaid_map[qa['id']] = paragraph['context']

    span_extractor.eval()
    answer_verifier.eval()
    for idx, batch in enumerate(batches):
        qa_id = batch.qa_id[0]

        node_scores, expected_f1s, global_answer_score = span_extractor(
            batch, eval_system=True)
        score_confidence, predicted_node_idxs = node_scores.max(dim=1)
        score_confidence, predicted_node_idxs = (variable_to_numpy(
            score_confidence,
            False), variable_to_numpy(predicted_node_idxs, False))

        # Answer score = predicted has answer probability
        answer_score = answer_verifier(batch,
                                       predicted_node_idxs=predicted_node_idxs,
                                       eval_system=True)
        answer_proba = variable_to_numpy(
            Sigmoid()(answer_score),
            False)  # convert from tensor to numpy array
        global_answer_proba = variable_to_numpy(Sigmoid()(global_answer_score),
                                                False)

        has_answer_proba = (0.3 * score_confidence +
                            0.4 * global_answer_proba + 0.3 * answer_proba)[0]

        predicted_span = batch.trees[0].span(predicted_node_idxs[0])
        predicted_has_answer = has_answer_proba >= HAS_ANSWER_THRESHOLD

        predicted_text = tokens_to_text(predicted_span, qaid_map[qa_id])
        official_eval[qa_id] = predicted_text if predicted_has_answer else ''
        official_eval_tokens[qa_id] = ' '.join(
            predicted_span) if predicted_has_answer else ''

    json.dump(official_eval, open(outpath, 'w'))
Example #24
@author: BurakKerim
'''

from nltk.tokenize import wordpunct_tokenize
from nltk.tag import StanfordPOSTagger
from nltk.tokenize.stanford import StanfordTokenizer

# May need to export JAVA_HOME;
# on Linux, set it in the shell with export or in your profile.
import os

java_path = '/usr/lib/jvm/jdk1.8.0_31'
# java_path = 'C:\Program Files\Java\jdk1.8.0_31'

os.environ['JAVAHOME'] = java_path

tagger_home = '/media/burak/Data/Workspace/Library/'\
              'stanford-postagger-full-2018-02-27/'

model = tagger_home + 'models/english-bidirectional-distsim.tagger'

jar_file = tagger_home + 'stanford-postagger.jar'

tagger = StanfordPOSTagger(model, path_to_jar=jar_file)
tokenizer = StanfordTokenizer(path_to_jar=jar_file)

sentence = 'This sentence is a test sentence for test in a test environment.'

print(tagger.tag(wordpunct_tokenize(sentence)))
print(tagger.tag(tokenizer.tokenize(sentence)))
Example #25
def test_stanford_tokenizer():
    files = os.listdir("/Users/ruben/Desktop/txt/")
    standfor = StanfordTokenizer()
    total = sum(len(standfor.tokenize(readfile(DOCS_TXT_ROOT + f))) for f in files)

    print "\nStanfordTokenizer total " + str(total)
Example #26
len(relations)

sentences = []
e1 = []
e2 = []
for j, line in enumerate(lines):
    text = []
    temp = []
    t = line.split("<e1>")
    text.append(t[0])
    temp.append(t[0])

    t = t[1].split("</e1>")
    e1_text = text
    e1_text = " ".join(e1_text)
    e1_text = tokenizer.tokenize(e1_text)
    text.append(t[0])
    e11 = t[0]
    y = tokenizer.tokenize(t[0])
    y[0] += "E11"
    temp.append(" ".join(y))
    t = t[1].split("<e2>")
    text.append(t[0])
    temp.append(t[0])
    t = t[1].split("</e2>")
    e22 = t[0]
    e2_text = text
    e2_text = " ".join(e2_text)
    e2_text = tokenizer.tokenize(e2_text)
    text.append(t[0])
    text.append(t[1])
Example #27
tokenizer = StanfordTokenizer(path_to_jar=jar_file)

s1 = "On a $50,000 mortgage of 30 years at 8 percent,"
" the monthly payment would be $366.88."

s2 = "\"We beat some pretty good teams to get here.\" Slocum said."

s3 = "Well, we couldn't have this predictable, cliche-ridden, "
"\"Touched by an Angel\" "
"(a show creator John Masius worked on) wanna-be if she didn't."

s4 = "I cannot work under these conditions!"

s5 = "The company spent $30,000,000 last year."

p = [s1, s2, s3, s4, s5]
par = ' '.join(p)


for s in p:
    print(word_tokenize(s))
    print(wordpunct_tokenize(s))
    print(tokenizer.tokenize(s))
    print()

for s in sent_tokenize(par):
    print(word_tokenize(s))
    print(wordpunct_tokenize(s))
    print(tokenizer.tokenize(s))
    print()
from nltk.tokenize.stanford import StanfordTokenizer
import os
os.environ['CLASSPATH'] = '/root/datasets/stanford-postagger-2018-10-16/'
with open('input_question.txt', 'r') as f:
    raw_q = f.read()
stanford = StanfordTokenizer()
print(stanford.tokenize(raw_q))