Example #1
def getTokens(texts):
    #global corpusTokens
    #global docsTokens
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()

    allTokens = []
    #tokens=[]
    if type(texts) != type([]):
        texts = [texts]
    for s in texts:
        #toks = nltk.word_tokenize(s.lower())
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
        #corpusTokens.extend(toks)
        #docsTokens.append(toks)

    allTokens_2 = [
        t.lower() for t in allTokens
        if len(t) > 2 and t.isalnum() and t not in stopwordsList
    ]
    #allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    #allTokens_stw = [t3 for t3 in allTokens if t3 not in stopwordsList]
    #allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word)
                      for word in allTokens_2]  #allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
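A hedged usage sketch for the function above; stopwordsList is not shown in the snippet and is assumed here to be NLTK's English stop word list:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer

stopwordsList = stopwords.words('english')   # assumption: module-level stop word list

getTokens("The cats were running quickly through the gardens!")
# -> ['cat', 'run', 'quickli', 'garden']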
Example #2
 def get_words(klass, text):
     tokenizer = WordPunctTokenizer()
     words = Counter()
     for word in tokenizer.tokenize(text):
         word = word.lower()
         if len(word) > 2 and not word in STOPWORDS:
             words[word] += 1
     return words
 def __init__(self, keywords):
     self.stemmer = PorterStemmer()
     self.stopwords = nltk.corpus.stopwords.words('english')
     self.tokenizer = WordPunctTokenizer()
     self.score = 0
     self.keywords = keywords
     #self.keywords = []
     '''for url in seedUrls:
Example #4
 def getEntity(self, word, originalcase):
     tokenizer = WordPunctTokenizer()
     for sentence in self.raw_sentences:
         words = tokenizer.tokenize(sentence.strip())
         if originalcase in words:
             entity = self.getEntityFromSentence(originalcase, words)
             if len(entity) > 1:
                 return " ".join(entity)
     return originalcase
Example #5
def getDocTokens(docText):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    docTokens = tokenizer.tokenize(docText)
    
    allTokens_2 = [t.lower() for t in docTokens if len(t)>2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
Example #6
def getTokenizedDocs(docs):
    docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    for text in docs:
        tokens = tokenizer.tokenize(text)
        clean = [token for token in tokens if token.isalnum()]
        clean = [token.lower() for token in clean if token.lower() not in stopwords] 
        clean = [token for token in clean if len(token) > 2]
        final = [stemmer.stem(word) for word in clean]
        docs_tokens.append(final)
    return docs_tokens
Example #7
def getTokens(doc): 
    #global stemmer
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
      
    with open(doc, 'r') as f:
        webpage = f.read()
    # nltk.clean_html() was removed in NLTK 3.x; strip markup another way, e.g. with bs4:
    text = BeautifulSoup(webpage, 'html.parser').get_text()   # requires: from bs4 import BeautifulSoup
    #tokens = nltk.word_tokenize(text)
    tokens = tokenizer.tokenize(text)
    clean = [token.lower() for token in tokens if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    return final
Example #8
 def convertDoctoTFIDF(self,doc):
     stemmer = PorterStemmer()
     tokenizer = WordPunctTokenizer()
     stopwords_e = stopwords.words('english')
     tokens = tokenizer.tokenize(doc)
     clean = [token for token in tokens if token.isalnum()]
     clean = [token.lower() for token in clean if token.lower() not in stopwords_e] 
     clean = [token for token in clean if len(token) > 2]
     final_doc = [stemmer.stem(word) for word in clean]
     doc_tfidf=[]
     words = self.model[1]
     for i in range(0,len(words)):            
         tf = final_doc.count(words[i])  
         doc_tfidf.append((tf,words[i]))
     return doc_tfidf
Example #10
    def __init__(self, data_dir, data_name, split_size, max_vocab_size,
                 max_enc_utt_len, max_dec_word_len, line_threshold):
        """"
        :param line_thres: how many line will be merged as encoding sentensce
        :param split_size: size of training:valid:test

        """

        self._data_dir = data_dir
        self._data_name = data_name
        self._cache_dir = os.path.join(data_dir, "utt_seq_split")
        if not os.path.exists(self._cache_dir):
            os.mkdir(self._cache_dir)

        self.tokenizer = WordPunctTokenizer().tokenize
        self.split_size = split_size
        self.max_vocab_size = max_vocab_size
        self.max_enc_utt_len = max_enc_utt_len
        self.max_dec_word_len = max_dec_word_len
        self.line_threshold = line_threshold

        utt_features = self.load_data()
        if utt_features is None:
            with open(os.path.join(data_dir, data_name), "rb") as f:
                utt_features = self._parse_file(f.readlines())
                self._create_corpus(utt_features, split_size)

        # clip train_y. Unlike word2seq, the encoder doesn't need clipping, since it uses a fixed history
#        self.train_y = self.clip_to_max_len(self.train_y)
#        self.valid_y = self.clip_to_max_len(self.valid_y)
#        self.test_y = self.clip_to_max_len(self.test_y)

        # get vocabulary
        self.vocab = self.load_vocab("vocab.txt")
Example #11
    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.
        """
        tokenizer = WordPunctTokenizer()
        #print CorpusCSV.stoplist
        #return self.getstream()
        for doc in self.getstream():
            #yield [word for word in doc.lower().split()]
                    #if word not in CorpusMiislita.stoplist]
            #yield doc
            yield [word for word in tokenizer.tokenize(doc.lower())
                   if word_ok(word)]
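For context, a minimal self-contained sketch of the same idea; word_ok is not shown above, so the filter below is an assumption rather than the original predicate:

from nltk.tokenize import WordPunctTokenizer

def texts_from_cor(path):
    tokenizer = WordPunctTokenizer()
    with open(path) as f:
        for line in f:                       # .cor format: one document per line
            yield [w for w in tokenizer.tokenize(line.lower())
                   if w.isalpha() and len(w) > 2]   # assumed stand-in for word_ok()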
Example #12
def getTokenizedDoc(doc):
    #docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    #stopwords.extend(["last","time","week","favorite","home","search","follow","year","account","update","com","video","close","http","retweet","tweet","twitter","news","people","said","comment","comments","share","email","new","would","one","world"])
    stopwords.extend(["com","http","retweet","tweet","twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords] 
    #clean = [token for token in clean if len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    #docs_tokens.append(final)
    return final
Example #13
 def __init__(self,keywords):
     self.stemmer = PorterStemmer()
     self.stopwords = nltk.corpus.stopwords.words('english')
     self.tokenizer = WordPunctTokenizer()
     self.score = 0
     self.keywords = keywords
     #self.keywords = []
     '''for url in seedUrls:
Example #14
def cleanDoc(doc):
    stopset = stop_words
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    stemmed = [stemmer.stem(word) for word in clean]
    final = [lemmatizer.lemmatize(word) for word in stemmed]
    return final
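A hedged usage sketch; stop_words is assumed here to be NLTK's English stop word list:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))   # assumption for this sketch

cleanDoc("The cats were running quickly")
# -> ['cat', 'run', 'quickli']

Since Porter output such as 'quickli' is rarely a dictionary form, the WordNet lemmatizer applied after stemming usually leaves the stems unchanged here.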
Example #15
def getTokenizedDoc(doc):
    #docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    #stopwords.extend(["last","time","week","favorite","home","search","follow","year","account","update","com","video","close","http","retweet","tweet","twitter","news","people","said","comment","comments","share","email","new","would","one","world"])
    stopwords.extend(["com", "http", "retweet", "tweet", "twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [
        token.lower() for token in clean if token.lower() not in stopwords
    ]
    #clean = [token for token in clean if len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    #docs_tokens.append(final)
    return final
Example #16
 def extract_words(self, wid):
     """Updates db with previously unseen words and lemmas, and page unigrams"""
     words_file = gzip.open(self.words_file, 'a')
     page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
     w = WordPunctTokenizer()
     qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid), 'fields': 'id, wid, pageid, html_en', 'sort': 'id asc'})
     print 'starting extraction for wiki %s...' % str(wid)
     for doc in qi:
         print 'extracting words for %s...' % doc['id']
         page_file.write('\t%s\n' % doc['pageid'])
         for word in w.tokenize(doc.get('html_en', '').lower()):
             if word not in self.words:
                 self.words[word] = self.counter
                 words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                 self.counter += 1
             page_file.write('%i\n' % self.words.get(word, 0))
     page_file.close()
     words_file.close()
Example #17
def rouge(candidate, reference, n=2, verbose=False):
    """This is a basic implementation of ROUGE-N.  It calculates the
    n-gram recall of a candidate summary against a reference
    summary.

    """
    tokenizer = WordPunctTokenizer()
    candidate = tokenizer.tokenize(candidate.lower())
    reference = tokenizer.tokenize(reference.lower())
    c_ngrams = set(ngrams(candidate, n))
    r_ngrams = set(ngrams(reference, n))
    cr_ngrams = [g for g in c_ngrams if g in r_ngrams]
    rouge_n = len(cr_ngrams) / len(r_ngrams)
    if verbose:
        print("{:d} matching {:d}-grams out of {:d} total.".format(
            len(cr_ngrams), n, len(r_ngrams)))
        print(cr_ngrams)
        print("ROUGE-{:d}: {:0.3f}".format(n, rouge_n))
    return rouge_n
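A hedged worked example of the recall computation above (n = 2; ngrams is assumed to come from nltk.util):

rouge("the cat sat on the mat", "the cat is on the mat", n=2)
# candidate bigrams: 5 unique; reference bigrams: 5 unique; 3 shared
# -> 3 / 5 = 0.6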
def getTokens(texts):
    #global corpusTokens
    #global docsTokens
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    
    allTokens=[]
    #tokens=[]
    if type(texts) != type([]):
        texts = [texts]
    for s in texts:
        #toks = nltk.word_tokenize(s.lower())
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
        #corpusTokens.extend(toks)
        #docsTokens.append(toks)
   
    allTokens_2 = [t.lower() for t in allTokens if len(t)>2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
Example #19
def document_as_words(document):
    """
    There is probably an NLTK function somewhere that does this already, but I
    couldn't find it.
    
    So this just converts a single document into a list of words which you
    can then use with the rest of these functions, to get a feature list
    which you can then classify.
    """ 
    stringbuf = StringIO.StringIO(document)
    
    word_tokenizer = WordPunctTokenizer()
    para_tokenizer = read_blankline_block
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
    words = []
    
    for para in para_tokenizer(stringbuf):
        for sent in sent_tokenizer.tokenize(para):
            for word in word_tokenizer.tokenize(sent):
                words.append(word)
                
    return words
Example #20
class TrollAnalyzer:
    def __init__(self, model):
        self.model = model
        self.tokenizer = WordPunctTokenizer()
        self.scale_score = lambda k: k * 2 - 1

    def analyze(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        predictions = {
            word: self.scale_score(self.model.predict_proba([word])[0][1])
            for word in words
        }
        total = self.scale_score(self.model.predict_proba([sentence])[0][1])
        return {"master": total, "tokenized": predictions}
Example #21
def get_nouns(raw_text, site):
    """ Returns a list of all the nouns or noun phrases found in the given text. """
    nouns = []
    try:
        cleaned_text = format_text_for_NER(raw_text, site)
        text_tokens = WordPunctTokenizer().tokenize(cleaned_text)
        for token_and_POS in nltk.pos_tag(text_tokens):
            try:
                POS = token_and_POS[1]
                if POS in ('NN', 'NNS', 'NNP', 'NNPS', 'NP'):
                    nouns.append(token_and_POS[0])
            except:
                continue
    except:
        return nouns
    return nouns
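For comparison, a minimal self-contained version of the same noun filter without the site-specific cleaning (format_text_for_NER is not shown above); the Penn tags NN, NNS, NNP and NNPS all start with 'NN', so a prefix check covers them:

import nltk
from nltk.tokenize import WordPunctTokenizer

def nouns_only(text):
    # requires the 'averaged_perceptron_tagger' NLTK data package
    tokens = WordPunctTokenizer().tokenize(text)
    return [tok for tok, pos in nltk.pos_tag(tokens) if pos.startswith('NN')]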
Example #22
def format_doc_for_sim_scoring(raw_doc):
    """ Tokenizes and filters/formats the words in the given document to be used during 
    similarity measurement. This method should be used both when a doc goes into the  
    corpus and when a doc is being compared to another doc for similarity. 
    @return: a list of tokens """
    stopset = set(stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = WordPunctTokenizer().tokenize(raw_doc)
    non_punct = [
        ''.join(ch for ch in token if not ch in string.punctuation)
        for token in tokens
    ]  # remove tokens that are purely punctuation
    clean_tokens = [
        token.lower() for token in non_punct
        if token.lower() not in stopset and len(token) > 2
    ]
    stemmed_tokens = [stemmer.stem(word) for word in clean_tokens]
    return ' '.join(stemmed_tokens).decode('latin-1')
class Scorer(object):
    #docs =[]
    #docs_length=[]
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self, doc):

        tokens = self.tokenizer.tokenize(doc)
        clean = [
            token.lower() for token in tokens
            if token.lower() not in self.stopwords and len(token) > 2
        ]
        final = [self.stemmer.stem(word) for word in clean]
        return final

    #def __init__(self,seedUrls):
    def __init__(self, keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords
        #self.keywords = []
        '''for url in seedUrls:
            page = Webpage(url)
            data = self.cleanDoc(page.text)
            for d in data:
                self.keywords.append(d)'''
# This function checks whether the URL's words contain the keywords.
# The score is the fraction of keywords that occur in the URL text.

    def calculate_score(self, url):
        words = url.getAllText().split()
        for w in self.keywords:
            if w in words:
                self.score += 1
        self.score = self.score / float(len(self.keywords))
        return self.score
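A hypothetical worked instance of the scoring described above:

# keywords = ['nlp', 'python', 'crawler']      3 keywords (made up)
# URL text contains 'python' and 'crawler'     2 matches
# calculate_score returns 2 / 3.0 ~= 0.667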
Example #24
    def __init__(self, data_dir, data_name, split_size, max_vocab_size,
                 max_enc_len, max_dec_len, line_thres):
        """"
        :param line_thres: how many line will be merged as encoding sentensce
        :param split_size: size of training:valid:test

        """

        self._data_dir = data_dir
        self._data_name = data_name
        self._cache_dir = os.path.join(
            data_dir,
            data_name.replace(".txt", "_") + "word_seq_split")
        if not os.path.exists(self._cache_dir):
            os.mkdir(self._cache_dir)

        self.tokenizer = WordPunctTokenizer().tokenize
        self.line_threshold = line_thres
        self.split_size = split_size
        self.max_vocab_size = max_vocab_size
        self.max_enc_len = max_enc_len
        self.max_dec_len = max_dec_len
        # try to load from existing file
        if not self.load_data():
            with open(os.path.join(data_dir, data_name), "rb") as f:
                self._parse_file(f.readlines(), split_size)

        # clip data
        self.train_x, self.train_y = self.clip_to_max_len(
            self.train_x, self.train_y)
        self.valid_x, self.valid_y = self.clip_to_max_len(
            self.valid_x, self.valid_y)
        self.test_x, self.test_y = self.clip_to_max_len(
            self.test_x, self.test_y)

        # get vocabulary
        self.vocab = self.get_vocab()

        self.print_stats("TRAIN", self.train_x, self.train_y)
        self.print_stats("VALID", self.valid_x, self.valid_y)
        self.print_stats("TEST", self.test_x, self.test_y)
Example #25
class Scorer(object):
    #docs =[]
    #docs_length=[]
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self,doc):
        
        tokens = self.tokenizer.tokenize(doc)
        clean = [token.lower() for token in tokens if token.lower() not in self.stopwords and len(token) > 2]
        final = [self.stemmer.stem(word) for word in clean]
        return final
    
    #def __init__(self,seedUrls):
    def __init__(self,keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords
        #self.keywords = []
        '''for url in seedUrls:
            page = Webpage(url)
            data = self.cleanDoc(page.text)
            for d in data:
                self.keywords.append(d)'''
# This function checks whether the URL's words contain the keywords.
# The score is the fraction of keywords that occur in the URL text.
    def calculate_score(self,url):
        words = url.getAllText().split()        
        for w in self.keywords:
            if w in words:
                self.score +=1
        self.score = self.score / float(len(self.keywords))
        return self.score
    review = ' '.join(review)
    #review =' '.join((item for item in review if not item.isdigit()))
    corpus.append(review)
    


## let's find the unique words ##
corpus_word_count = corpus

count = ' '.join([str(elm) for elm in corpus_word_count])

from nltk.tokenize.regexp import WordPunctTokenizer

#This tokenizer also splits our string into tokens:

my_toks = WordPunctTokenizer().tokenize(count)

len(my_toks)
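# For reference, WordPunctTokenizer splits on the pattern \w+|[^\w\s]+, so runs of
# word characters and runs of punctuation become separate tokens; an illustrative check:
#   WordPunctTokenizer().tokenize("Don't stop!")  ->  ['Don', "'", 't', 'stop', '!']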

## unique word count

my_vocab = set(my_toks)
len(my_vocab)    ## 6087
    
type(corpus)


## logistic regression ##
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
Example #27
#!/usr/bin/env python
import sys

from nltk.tokenize.regexp import WordPunctTokenizer

filename = sys.argv[1]

with open(filename, 'r') as f:
    contents = f.read()

wptk = WordPunctTokenizer()
tokenized = wptk.tokenize(contents)

lower_list = []
for word in tokenized:
    l = word.lower()
    lower_list.append(l)

from collections import Counter
counts = Counter(lower_list)
#counts_list = sorted(lower_list, key=lambda x: (counts[x], x), reverse=True)

print(counts)
def stem_text(text):
    tokenizer = WordPunctTokenizer()
    stemmer = SnowballStemmer('french')
    liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    return ' '.join(liste_racines)
Example #29
 def __init__(self):
     self.stemmer = PorterStemmer()
     self.stopwords = nltk.corpus.stopwords.words('english')
     self.tokenizer = WordPunctTokenizer()
     self.keywords = []
     self.score = 0
Example #30
        document += sentence + '\n'
        # Adjust number to make sure that a document doesn't exceed 15000 tokens
        if i % 500 == 0:
            documents.append(document)
            document = '\n'
    documents.append(document)

    print(f'Number of documents: {len(documents)}\n')

    # Check word count for every document
    # for i,doc in enumerate(documents):
    #     word_count = len(doc.split())
    #     print(f'Doc {i}, word_count: {word_count}')

    # Check token count for every document (should not be more than 15000)
    for i, doc in enumerate(documents):
        token_count = WordPunctTokenizer().tokenize(doc)
        # print(token_count)
        print(f'Doc {i}, token count: {len(token_count)}')
    ##
    # Feed each text document separately to BERN
    for i, doc in enumerate(tqdm(documents), 14):
        BERN_annotated_df = BERN_annotation(params, i, doc, sentences)
        # If csv doesn't exist yet, create one, and append to it in the next iterations
        with open('./output/subtitles/BERN_annotated_subtitles.csv', 'a') as f:
            BERN_annotated_df.to_csv(f,
                                     header=f.tell() == 0,
                                     index=False,
                                     sep=';')
        time.sleep(5)
Example #31
 def __init__(self, nltkTokenizer=WordPunctTokenizer()):
     self.nltkTokenizer = nltkTokenizer
Example #32
- name abbreviations:  E. Talvik ; M. Unt

See https://github.com/estnltk/estnltk/issues/25 for more info.
"""
from __future__ import unicode_literals, print_function, absolute_import

from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.tokenize.api import StringTokenizer

import regex as re

EST_ALPHA_LOWER = 'abcdefghijklmnoprsšzžtuvwõäöüxyz'
EST_ALPHA_UPPER = EST_ALPHA_LOWER.upper()
EST_ALPHA = EST_ALPHA_LOWER + EST_ALPHA_UPPER

wptokenizer = WordPunctTokenizer()
digits = re.compile(r'\d+')

#  Listing of different hyphen/minus/dash symbols in utf8;
#  It is likely that these symbols are used interchangeably with the regular hyphen symbol;
hypens_dashes = re.compile(
    '^(-|\xad|\u2212|\uFF0D|\u02D7|\uFE63|\u002D|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|\u2212)$'
)


def join_ordinals(left, right):
    return right == '.' and digits.match(left) is not None


def join_hyphen(left, right):
    return hypens_dashes.match(left) or hypens_dashes.match(right)
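Illustrative checks of the two join predicates above (inputs are made up):

# join_ordinals('12', '.')   -> True    (digits followed by a period form an ordinal)
# join_ordinals('abc', '.')  -> False
# join_hyphen('foo', '-')    -> truthy  (the right-hand token is a hyphen/dash symbol)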
Example #33
    return False


current_key = None
qtokens = None
reduced_keys = []

for line in sys.stdin:
    query, tweet = line.split('\t', 1)
    if current_key != query:
        for last_tweet in reduced_keys:
            # TODO Reduce Processing Phase 2
            # more processing against key's corpus
            print json.dumps(last_tweet)
        current_key = query
        qwords = WordPunctTokenizer().tokenize(query)
        qtokens = [w for w in qwords if not remove_if_punct_or_stopword(w)]
        reduced_keys = []
    tweet = json.loads(tweet)

    # match query terms into tweet
    matches = 0
    for token in qtokens:
        if token in tweet['parsed']:
            matches += 1

    tweet['tokens'] = qtokens
    tweet['matches'] = matches
    tweet['cxScore'] = float(matches) / len(qtokens)
    tweet['totScore'] = tweet['cxScore'] + tweet['qlScore']
    # TODO Reduce Processing Phase 1
Example #34
 def tokenize_text(self, text):
     words = WordPunctTokenizer().tokenize(text)
     return words
 def __init__(self):
     self.stemmer = PorterStemmer()
     self.stopwords = nltk.corpus.stopwords.words('english')
     self.tokenizer = WordPunctTokenizer()
     self.keywords = []
     self.score = 0
    def handle(self, *args, **options):
        super(Command, self).handle(self, *args, **options)

        for database in self.selected_dbs:

            print "Processing database " + database
            cursor = connections[database].cursor()

            get_authors_query = """
                SELECT name, long_name
                  FROM author
            """

            get_comments_query = """
                SELECT id, content FROM comment
            """

            cursor = connections[database].cursor()
            tokenizer = WordPunctTokenizer()

            self.stdout.write("Grabbing authors...")
            authors = []
            cursor.execute(get_authors_query)
            self.pbar_setup(maxval=cursor.rowcount)
            for row in dictfetch(cursor):
                authors.append(row['name'])
                if row['long_name'] != "":
                    authors.append(row['long_name'])
                self.pbar_increment()
            self.pbar_destroy()

            self.stdout.write("Sorting authors...")
            authors.sort()

            self.stdout.write("Determining real parents...")
            real_parents = []
            cursor1 = connections[database].cursor()
            cursor1.execute(get_comments_query)
            self.pbar_setup(maxval=cursor1.rowcount)
            for row in dictfetch(cursor1):
                tokens = tokenizer.tokenize(row['content'])
                if len(tokens) > 0:
                    if tokens[0] == '@':
                        real_parents.append(int(row['id']))
                    else:
                        i = bisect_left(authors, tokens[0])
                        if i != len(authors) and authors[i] == tokens[0]:
                            real_parents.append(int(row['id']))

                self.pbar_increment()
            self.pbar_destroy()

            self.stdout.write("Non-Real-parents found: {}".format(
                len(real_parents)))

            cursor2 = connections[database].cursor()
            update_query = """
                UPDATE comment 
                   SET real_parent = (CASE WHEN id in ({}) THEN 0 ELSE 1 END)            
            """.format(('%s,' * len(real_parents)).rstrip(','))
            cursor2.execute(update_query, real_parents)

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
Example #37
    def __init__(self):

        NltkTokenizer.__init__(self)
        _WordPunctTokenizer.__init__(self)
Example #38
#!/usr/bin/env python3.4

import os
import random
from collections import Counter

from nltk.tokenize.regexp import WordPunctTokenizer

nltk_tokenizer = WordPunctTokenizer()


def tokenize(text):
    return nltk_tokenizer.tokenize(text.lower())


def loadcorpus():
    """Load the corpus of abstracts and documents."""
    dirname = "cmplg-txt"
    abstracts = {}
    documents = {}
    for fn in sorted(os.listdir(dirname)):
        docid = fn.split("-")[0]
        if fn.endswith("abstract.txt"):
            with open(os.path.join(dirname, fn), 'r') as f:
                abstracts[docid] = f.read()
        if fn.endswith("sentences.txt"):
            with open(os.path.join(dirname, fn), 'r') as f:
                documents[docid] = f.readlines()
    return abstracts, documents
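A hedged usage sketch, assuming the cmplg-txt directory layout that loadcorpus expects:

abstracts, documents = loadcorpus()
some_id = next(iter(abstracts))              # any available document id
abstract_tokens = tokenize(abstracts[some_id])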

Example #39
 def __init__(self, model):
     self.model = model
     self.tokenizer = WordPunctTokenizer()
     self.scale_score = lambda k: k * 2 - 1