Example #1
def main():
    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        features = []

        for words in _file:
            feature = []
            is_sentence = True

            # exclude the polarity label
            for word in words.split()[1:]:
                try:
                    word = word.decode("utf-8")
                    if word not in [".", ",", ":", "?", "!"] \
                            and not has_stop_list(word):

                        feature.append(stemmer.stem(word))
                except UnicodeDecodeError:
                    # ignore lines with garbled characters (mojibake)
                    is_sentence = False
                    break

            if is_sentence:
                features.append(feature)

    return features
Example #2
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        if word not in stop_words:
            ret.append(stemmer.stem(word.lower()))
    return ret
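# A minimal usage sketch for make_tags, assuming the surrounding module defines
# stop_words, e.g. stop_words = set(stopwords.words('english')):
# >>> make_tags("Connecting the Connected Connections")
# ['connect', 'connect', 'connect']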
def stemming(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in the lines and stems them

    Return: stemmed_list (list of strings of stemmed terms)
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the commented-out approach below simply drops punctuation characters,
        # which merges tokens such as amazon.com => amazoncom
        # nopunct_line = ''.join([c for c in line
        #                         if re.match("[a-z\-\' \n\t]", c)])
        # replacing runs of non-alphanumerics with a space avoids that problem:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list to store stemmed terms
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem(term)
            stemmed_line.append(term)
        # back to sentence as a string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
    def stemm(cls, tokens):
        stemmer = PorterStemmer()

        for i, t in enumerate(tokens):
            tokens[i] = stemmer.stem(t)

        return tokens
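    # Note: stemm both mutates the tokens list in place and returns it; a usage
    # sketch, with the owning class name purely hypothetical:
    # >>> toks = ["caresses", "ponies"]
    # >>> SomeTokenizer.stemm(toks)
    # ['caress', 'poni']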
def process_email(filename):
  
  f = open(filename, 'r')
  text = f.read()
  f.close()
  
  text = text.lower()
  
  #replaces html tags by space
  text = re.sub(r'<[^<>]+>', ' ', text)
  
  #replaces numbers by word number
  text = re.sub(r'[0-9]+', 'number', text)
  
  #replaces URLs by word httpaddr
  text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
  
  #replaces email addresses by word emailaddr
  text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
  
  #replaces dollar signs with word dollar 
  text = re.sub(r'[$]+', 'dollar', text)
  
  #removes punctuation and non-words and separates words 
  words = re.split('[^a-z0-9]| ', text)
  
  #removes empty strings left by the split
  words = filter(lambda x: x!='', words)
  
  #reduces words to their stems
  stemmer = PorterStemmer()
  words = [stemmer.stem(word) for word in words]
  
  return words
Example #6
    def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
        LL = 0
        if answer_text != '':
            tokens = word_tokenize(str(answer_text), language='english')
            porter_stemmer = PorterStemmer()
            unique_wordcount = len(stemmed_vocabulary)
            """
            for each unique word w in the answer:
                Cw = count of w in answer_text
                PwM = self.distrib_matrix[stemmer(w)]
                unique_wordcount = len(tokenize(answer_text))
            """
            for w in tokens:
                _w = w.strip().lower()
                Cw = 0
                for _ in answer_text.split():
                    if _w == _.strip().lower():
                        Cw += 1

                try:
                    w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
                except AttributeError:
                    w_stem = porter_stemmer.stem(_w)
                try:
                    PwM = distrib_matrix[w_stem]
                except KeyError:  # key error means frequency is equal to cutoff point 1
                    PwM = 1
                LL += (Cw * log(float(PwM)))

            try:
                LL = "{0:.2f}".format(LL / float(unique_wordcount))
            except ZeroDivisionError:
                LL = 0 

        return LL
Example #7
def openAndProcessingFiles(path,resultDict):  # Main Function

    for filename in os.listdir(os.getcwd()+path):

        thisFile = open(os.getcwd()+path+'/'+filename,'r') #open the file and process each file
        
        currentTextString = " ".join(thisFile.read().split())#store the file as a string for removing HTML tags
        
        textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString) # remove HTML tags (String)
        
        textAfterHtmlRemovingList = textAfterHtmlRemovingString.split() # convert the string to a list of words
        
        textRemoveingUnnecessaryCharactersList = [removeUnnecessaryCharacters(word) for word in textAfterHtmlRemovingList ] 

        textRemoveingUnnecessaryCharactersList = [word for word in textRemoveingUnnecessaryCharactersList if word is not None]
        
        stop_words = set(stopwords.words('english'))
        
        stop_words.update(['texthtml', 'html', 'server', "email", 'date', 'gmt', 'www']) # extra stopwords added continually by analyzing previous result sets
    
        textAfterStopwordsRemovingList = [word for word in textRemoveingUnnecessaryCharactersList if word not in stop_words] #remove stopwords

        stemmer = PorterStemmer() #stemming
        
        for eachWord in textAfterStopwordsRemovingList:
            eachWord = stemmer.stem(eachWord)
            storeToResultDict(eachWord,resultDict)
    
        thisFile.close()
Example #8
    def review_to_words(raw_review, remove_stopwords = False):
        # BeautifulSoup pulls data out of html file
        # here it removes html tags and markups
        text = BeautifulSoup(raw_review).get_text()

        # replace numbers by word number
        text=re.sub(r'[0-9]+','number',text)

        # remove punctuations (they can be analyzed for better results)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        text = text.lower()

        #make a list of words
        words_list = text.split()

        #download nltk text data sets, including stop words
        #nltk.download()

        if remove_stopwords:
            # get stopwords, searching a set is faster than searching a list
            stops = set(stopwords.words('english'))
            # remove stopwords
            words_list = [word for word in words_list if not word in stops]

        # reduce words to their stems
        stemmer=PorterStemmer()
        words_list=[stemmer.stem(word) for word in words_list]
        # return the list of words
        return words_list
Example #9
    def get_ngram_features(self):

        stemmer = PorterStemmer()

        top_features = [(stemmer.stem(token) + "__TOP__", True) for token in self.top_text]
        bottom_features = [(stemmer.stem(token) + "__BOTTOM__", True) for token in self.bottom_text]
        all_features = [(stemmer.stem(token) + "__ALL__", True) for token in self.all_text]
        self.ngram_features = dict(top_features + bottom_features + all_features)
def normalize(word):
    '''
    normalize the word for query or indexing
    :param word: unicode string
    :return: unicode string of the normalized term
    '''
    porter = PorterStemmer()
    return porter.stem(word) if word[0].isalpha() else ''
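# A usage sketch for normalize: alphabetic tokens are stemmed, everything else
# collapses to the empty string:
# >>> normalize("connections")
# 'connect'
# >>> normalize("42")
# ''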
Example #11
  def __process_email(self, email_contents, vocab):
    '''
    Preprocess the body of an email and return a
    list of word_indices.

    Arguments:
      email_contents (str): Email body.
      vocab (dict): Words dictionary.

    Return:
      (int list): Indices into vocab for the words of the processed email body.
    '''
    # Lower case.
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and also get rid of any punctuation
    word_list = re.split(' |@|$|/|#|\.|-|:|&|\*|\+|=|[|]|\?|!|(|)|{|}|,|''|"|>|_|<|;|%',
                        email_contents)

    # Remove empty string and skip the word if it is too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Remove any non alphanumeric characters
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]

    # Remove empty string and skip the word if it is too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Stem the word
    ps = PorterStemmer() 
    word_list = [ps.stem(s) for s in word_list]
    word_indices = []

    # Find index in vocab list.
    for w in word_list:
      if w in vocab:
        word_indices.append(vocab[w])
    return word_indices
Example #12
 def processContent(self, content):
     stemmer = PorterStemmer()
     tokens = word_tokenize(content)
     tokens = filter(lambda x: len(x) < 20 and x.isalnum(), tokens)
     tokens = [stemmer.stem(token.lower()) for token in tokens]
     tokens = filter(lambda x: x not in stopwords.words('english'), tokens)
     tokens = [str(token) for token in tokens]      
     bow = FreqDist(tokens)
     return(bow)
 def getStemmedWords(self,html):
     
     stemmed_words=[]
     #stemmer = SnowballStemmer("english")
     stemmer = PorterStemmer()
     for token in html:
         stemmed_words.append(stemmer.stem(token))
         
     return ' '.join(stemmed_words)
Example #14
def main():
    # Use file defined by BIOC_IN as default if no other provided
    bioc_in = BIOC_IN
    if len(sys.argv) >= 2:
        bioc_in = sys.argv[1]
    
    # A BioCReader object is put in place to hold the example BioC XML
    # document
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    
    # A BioCWriter object is prepared to write out the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)
    
    # The NLTK porter stemmer is used for stemming
    stemmer = PorterStemmer()
    
    # The example input file given above (by BIOC_IN) is fed into
    # a BioCReader object; validation is done by the BioC DTD
    bioc_reader.read()
    
    # Pass over basic data
    bioc_writer.collection = bioc_reader.collection
    
    # Get documents to manipulate
    documents = bioc_writer.collection.documents
    
    # Go through each document
    annotation_id = 0
    for document in documents:
        
        # Go through each passage of the document
        for passage in document:
            #  Stem all the tokens found
            stems = [stemmer.stem(token) for 
                     token in wordpunct_tokenize(passage.text)]
            # Add an annotation showing the stemmed version, in the
            # given order
            for stem in stems:
                annotation_id += 1
                
                # For each token an annotation is created, providing
                # the surface form of a 'stemmed token'.
                # (The annotations are collectively added following
                #  a document passage with a <text> tag.)
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 
                                          'stemmed token')
                passage.add_annotation(bioc_annotation)
    
    # Print file to screen w/o trailing newline
    # (Can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))
    
    # Write to disk
    bioc_writer.write()
 def stemmingword(word_list, stemtype='porter'):
     if stemtype == 'porter':
         stemengine = PorterStemmer()
     else:
         stemengine = LancasterStemmer()
     try:
         filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore') for token in word_list]
         return filtered_words
     except UnicodeDecodeError:
         print('Error in character encoding, discarding text "{}"'.format(' '.join(word_list)))
Example #16
def getPosWords():
  stemmer = PorterStemmer()
  stemmedPosTokens = []
  pos = open(r'pos.txt').read()
  pos = re.sub("\d", "", pos)
  posWords = nltk.word_tokenize(pos)
  for posWord in posWords:
    stemmedPosWord = stemmer.stem(posWord)
    stemmedPosTokens.append(stemmedPosWord.lower())
  return stemmedPosTokens
def preprocess( result ):
    words = removePunct(result.title)
    words += " "
    words += removePunct(result.snippet)
    result.tokens = nltk.word_tokenize(words)
    # stem and lowercase non-stopword tokens in place; reassigning the loop
    # variable alone would not modify result.tokens
    stemmer = PorterStemmer()
    for i, tok in enumerate(result.tokens):
        if tok not in STOPS:
            result.tokens[i] = stemmer.stem(tok).lower()
    return result
Example #18
 def tokenize(self, sentence, do_stopwords, do_stemming,use_bigrams):
         words = word_tokenize(sentence)
         words = [w.lower() for w in words if len(w) > 2]
         if do_stopwords:
                 words = [w for w in words if w not in stop_set]
         if do_stemming:
                 stemmer = PorterStemmer()
                 words = [stemmer.stem(w) for w in words]
         if use_bigrams:
                 words = bigrams(words)
         return words
Example #19
def update_Porter_stemming(): #We use stems occasionally.
    "Updating stems from Porter algorithm..."
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor.execute("""SELECT word FROM words WHERE wordid <= 750000 and stem is null;""")
    words = cursor.fetchall()
    for local in words:
        word = ''.join(local)
        if re.match("^[A-Za-z]+$",word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + """' WHERE word='""" + ''.join(local) + """';""" 
            z = cursor.execute(query)
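# The query above is built by string concatenation; a sketch of the same update
# done with a parameterized query instead (the helper name is hypothetical, and
# the placeholder style depends on the driver: "%s" for MySQLdb-style drivers,
# "?" for sqlite3):
def update_stem_parameterized(cursor, stemmer, word):
    # let the driver handle quoting and escaping
    cursor.execute("UPDATE words SET stem=%s WHERE word=%s",
                   (stemmer.stem(word), word))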
Example #20
 def stemmer(self, raw):
     """
     Use porter stemmer from nltk library 
     to stem tokens in raw text.
     """
     tokens = word_tokenize(raw)
     porter = PorterStemmer()
     # lancaster = LancasterStemmer()
     # stem_lancaster = [lancaster.stem(t) for t in tokens]
     stem_porter = [porter.stem(t) for t in tokens]
     return stem_porter
Example #21
def getNegWords():  
  stemmer = PorterStemmer()
  stemmedNegTokens = []

  neg = open(r'neg.txt').read()
  neg = re.sub("\d", "", neg)
  negWords = nltk.word_tokenize(neg)
  for negWord in negWords:
    stemmedNegWord = stemmer.stem(negWord)
    stemmedNegTokens.append(stemmedNegWord.lower())
  return stemmedNegTokens
Example #22
def getUncertainWords():  
  stemmer = PorterStemmer()
  stemmedUnTokens = []

  un = open(r'uncertain.txt').read()
  un = re.sub("\d", "", un)
  unWords = nltk.word_tokenize(un)
  for unWord in unWords:
    stemmedUnWord = stemmer.stem(unWord)
    stemmedUnTokens.append(stemmedUnWord.lower())
  return stemmedUnTokens
Example #23
def clean_data_to_feed_classifier(tweests):
	st = PorterStemmer()
	stop = stopwords.words('english')
	parsed_tweests = []
	for x in tweests:
		y=x[0]
		y = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",y).split())
		y = ' '.join(re.sub(r'(.)\1+', r'\1\1', i.lower())  for i in y.split() if i not in stop)
		y = ' '.join(st.stem(i) for i in y.split() if len(i) > 3 and i.isalpha() and wordnet.synsets(i))
		# y = punctuations_repl(y)
		parsed_tweests.append(y)
	return parsed_tweests
Example #24
 def buildTrainTokensBigram(self): 
   self.trainTokens = []
   with open(self.trainingData, 'r') as reviews:
     for review in reviews:
       data = json.loads(review)
       words = word_tokenize(data['text'])
       words = [norm(word) for word in words if norm(word)]
       words = [word for word in words if word not in stwords]
       stemmer = PorterStemmer()
       words = [stemmer.stem(word) for word in words]
       featureSet = self.buildWordFeatureSetBigram(words)
       self.trainTokens.append((featureSet, data['stars']))
def normalize_data(lines):
    norm_words = []
    punctuation = ['!', '.', ';', ':', '\'', '"','`','?']
    exceptions = ['\n', '\'s', '\'t', " "]
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    mega_stop_list = list(itertools.chain(punctuation, exceptions))
    print "    Now Normalizing......."
    for sentence in lines:
        words = [stemmer.stem(word.lower()) for word in word_tokenize(sentence.rstrip("\\n")) if word not in [stop, "not"]]
        norm_words.extend([word for word in negate_Ngram(words) if not re.match("[0-9]+", word) if word.lower() not in mega_stop_list])
    return norm_words
def tokenize_normalize(raw):
    '''
    tokenize raw texts
    :param raw: unicode string
    :return: list[unicode]: a list of tokenized unicode
    Example: words = tokenize_normalize(line)
    '''
    tokens = [t for t in word_tokenize(raw) if len(t) < 20]  # don't use any token too long (like genetic sequence)
    porter = PorterStemmer()
    tokens_n = [porter.stem(t) for t in tokens if t[0].isalpha()]  # only interested in word
    tokens_n = ['NUMBER' if all(a.isdigit() for a in t) else t for t in tokens_n]  # combine all numbers to one
    return tokens_n
Example #27
def formatText(text):
    text = text.lower()
    text = text.replace('.',' ')
    text = text.replace('\\',' ')
    text = text.replace('/',' ')
    text = text.replace('\"',' ')
    text = text.replace('\'',' ')
    text = text.replace(':',' ')
    text = text.replace(';',' ')
    text = text.replace('(',' ')
    text = text.replace(')',' ')
    porter = PorterStemmer()
    return ' '.join([porter.stem(word) for word in text.split(' ')])
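# An equivalent, more compact way to express the same character replacement,
# sketched with str.maketrans (kept as a separate helper so formatText above is
# unchanged; the function name is hypothetical):
def formatTextTranslate(text):
    # map each listed punctuation character to a space, then stem each token
    table = str.maketrans({c: ' ' for c in '.\\/"\':;()'})
    porter = PorterStemmer()
    return ' '.join(porter.stem(word) for word in text.lower().translate(table).split(' '))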
def text_preprocessing(text):
    #lowercase everything
    text = text.lower()
    #remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(" ", text)
    #remove stopwords
    no_stopwords = [word for word in text.split() if word.lower() not in ext_stopwords]
    text = " ".join(no_stopwords)
    #stem the words
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(w) for w in text.split()])
    return text
Example #29
 def review_mapper(self, _, data):
   review = data['text']
   rating = data['stars']
   business_id = data['business_id']
   category = data['category']
   words = word_tokenize(review)
   words = [norm(word) for word in words if norm(word)]
   words = [word for word in words if word not in stwords]
   tagged_words = tagger.tag(words)
   stemmer = PorterStemmer()
   tagged_words = [(stemmer.stem(tagged_word[0]), tagged_word[1]) for tagged_word in tagged_words]
   for tagged_word in tagged_words:
     yield (category, tagged_word), (business_id, rating, 1)
 def update_Porter_stemming(self): #We use stems occasionally.
     print "Updating stems from Porter algorithm..."
     from nltk import PorterStemmer
     stemmer = PorterStemmer()
     cursor = db.query("""SELECT word FROM words""")
     words = cursor.fetchall()
     for local in words:
         word = ''.join(local) #Could probably take the first element of the tuple as well?
          #Apostrophes have the same stem as the word, if they're included
         word = word.replace("'s","")
         if re.match("^[A-Za-z]+$",word):
             query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + """' WHERE word='""" + ''.join(local) + """';"""
             z = cursor.execute(query)
Example #31
def run(
    lr=0.001,
    batsize=20,
    epochs=100,
    embdim=64,
    encdim=128,
    numlayers=1,
    dropout=.25,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    beamsize=1,
    cosine_restarts=1.,
    seed=456789,
):
    # DONE: Porter stemmer
    # DONE: linear attention
    # DONE: grad norm
    # DONE: beam search
    # DONE: lr scheduler
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    stemmer = PorterStemmer()
    tokenizer = lambda x: [stemmer.stem(xe) for xe in x.split()]
    ds = GeoQueryDatasetFunQL(
        sentence_encoder=SequenceEncoder(tokenizer=tokenizer),
        min_freq=minfreq)

    train_dl = ds.dataloader("train", batsize=batsize)
    test_dl = ds.dataloader("test", batsize=batsize)
    tt.tock("data loaded")

    do_rare_stats(ds)

    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = create_model(embdim=embdim,
                         hdim=encdim,
                         dropout=dropout,
                         numlayers=numlayers,
                         sentence_encoder=ds.sentence_encoder,
                         query_encoder=ds.query_encoder,
                         feedatt=True)

    # model.apply(initializer)

    tfdecoder = SeqDecoder(
        model,
        tf_ratio=1.,
        eval=[
            CELoss(ignore_index=0, mode="logprobs"),
            SeqAccuracies(),
            TreeAccuracy(
                tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
        ])

    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(
            model,
            maxtime=100,
            tf_ratio=0.,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])

        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    else:
        print("Doing beam search!")
        freedecoder = BeamDecoder(
            model,
            beamsize=beamsize,
            maxtime=60,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])

        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    # 4. define optim
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max} ({epochs} * {len(train_dl)})")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function (using partial)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=train_dl,
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=test_dl,
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=tfdecoder, dataloader=test_dl, losses=vlosses, device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")
 def __init__(self):
     super().__init__()
     self._stemmer = PorterStemmer()
Example #33
def tfidf_classifier(fname):
    with open(fname + ".txt", "r") as file:
        paragraph = file.read()

    #clean the extracted content
    paragraph = " ".join(re.findall(r"\b[a-z0-9]+\b", paragraph,
                                    flags=re.I)).lower()

    #get the part of speech for every word in the content
    pos_tag_words = pos_tag(paragraph.split())
    porter_stemmer_obj = PorterStemmer()
    stem = porter_stemmer_obj.stem
    pos_tag_words = [(str(stem(tag[0])),
                      tag[-1]) if tag[-1].startswith("VB") else tag
                     for tag in pos_tag_words]
    paragraph = " ".join([w[0] for w in pos_tag_words])

    #extract all the nouns, adjectives, adverbs and verbs from the paragraph
    temp_noun_adj_list = []
    temp_verb_adv_list = []
    all_words = []
    all_words_count_dict = {}
    for pos_words in pos_tag_words:
        if (pos_words[-1].startswith("NN") or pos_words[-1].startswith("JJ")):
            temp_noun_adj_list.append(pos_words[0])
            if len(temp_verb_adv_list) > 1:
                adv_verb_str = " ".join(temp_verb_adv_list)
                if adv_verb_str not in all_words_count_dict:
                    all_words_count_dict[adv_verb_str] = paragraph.count(
                        adv_verb_str)
                temp_verb_adv_list = []
            elif temp_verb_adv_list:
                if temp_verb_adv_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_verb_adv_list[0]] = paragraph.count(
                            temp_verb_adv_list[0])
                temp_verb_adv_list = []
        elif pos_words[-1].startswith("VB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_noun_adj_list[0]] = paragraph.count(
                            temp_noun_adj_list[0])
                temp_noun_adj_list = []
        elif pos_words[-1].startswith("RB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_noun_adj_list[0]] = paragraph.count(
                            temp_noun_adj_list[0])
                temp_noun_adj_list = []
        else:
            if temp_noun_adj_list:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            if temp_verb_adv_list:
                adv_str = " ".join(temp_verb_adv_list)
                if adv_str not in all_words_count_dict:
                    all_words_count_dict[adv_str] = paragraph.count(adv_str)
                temp_verb_adv_list = []

    if len(temp_noun_adj_list) > 0:
        adj_noun_str = " ".join(temp_noun_adj_list)
        if adj_noun_str not in all_words_count_dict:
            all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
    if len(temp_verb_adv_list) > 0:
        adv_str = " ".join(temp_verb_adv_list)
        if adv_str not in all_words_count_dict:
            all_words_count_dict[adv_str] = paragraph.count(adv_str)

    with open(fname + ".json", "w") as file:
        json.dump(all_words_count_dict, file)
Example #34
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Takes a path to an embeddings file and a dataframe as input; the default
    keyword embed_signal='n' means that embeddings are not encoded.
    Returns an expanded dataframe with:
    a column of lemmatised words; a column of stemmed words; a column indicating
    capitalisation status; a column indicating capitalisation status of the previous
    token; and columns indicating shape, previous shape, short shape, previous
    short shape, and following token short shape.
    If kwarg embed_signal is 'y', a list of embeddings is also returned.

    '''

    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()
    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['caps']] = 'all_caps'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains('\W'), ['short_shape']] = '-'

    prev_short_shape_list = []
    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = []
    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:

        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub('\d', 'd', replace_lowers)

        shape_list.append(replace_digits)

    dataframe['shape'] = shape_list

    prev_shape_list = []
    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list

    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
Example #35
class NGramAligner(Aligner):
    def __init__(self):
        self.stemmer = PorterStemmer()

    def align(
        self,
        source: Doc,
        targets: List[Doc],
    ) -> List[Dict]:

        alignments = []
        source_ngram_spans = self._get_ngram_spans(source)
        for target in targets:
            target_ngram_spans = self._get_ngram_spans(target)
            alignments.append(
                self._align_ngrams(target_ngram_spans, source_ngram_spans))
        return alignments

    def _get_ngram_spans(
        self,
        doc: Doc,
    ):
        ngrams = []
        for sent in doc.sents:
            for n in range(1, len(list(sent))):
                tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
                ngrams.extend(_ngrams(tokens, n))

        def ngram_key(ngram):
            return tuple(
                self.stemmer.stem(token.text).lower() for token in ngram)

        key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
        key_to_spans = {}
        for k, grouped_ngrams in key_to_ngrams.items():
            key_to_spans[k] = [(ngram[0].i, ngram[-1].i + 1)
                               for ngram in grouped_ngrams]
        return key_to_spans

    def _align_ngrams(
        self, ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
        ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
    ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Align ngram spans between two documents
        Args:
            ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple to a list of token spans
                of format (start_pos, end_pos)
            ngram_spans_2: Same format as above, but for second text
        Returns: map from each (start, end) span in text 1 to list of aligned (start, end) spans in text 2
        """
        if not ngram_spans_1 or not ngram_spans_2:
            return {}
        max_span_end_1 = max(
            span[1]
            for span in itertools.chain.from_iterable(ngram_spans_1.values()))
        token_is_available_1 = [True] * max_span_end_1
        matched_keys = list(
            set(ngram_spans_1.keys())
            & set(ngram_spans_2.keys()))  # Matched normalized ngrams between the two texts
        matched_keys.sort(
            key=len, reverse=True)  # Process n-grams from longest to shortest

        alignment = defaultdict(
            list
        )  # Map from each matched span in text 1 to list of aligned spans in text 2
        for key in matched_keys:
            spans_1 = ngram_spans_1[key]
            spans_2 = ngram_spans_2[key]
            available_spans_1 = [
                span for span in spans_1
                if all(token_is_available_1[slice(*span)])
            ]
            matched_spans_1 = []
            if available_spans_1 and spans_2:
                # if ngram can be matched to available spans in both sequences
                for span in available_spans_1:
                    # It's possible that these newly matched spans may be overlapping with one another, so
                    # check that the token positions are still available (only one span allowed per token in text 1):
                    if all(token_is_available_1[slice(*span)]):
                        matched_spans_1.append(span)
                        token_is_available_1[slice(
                            *span)] = [False] * (span[1] - span[0])
            for span1 in matched_spans_1:
                alignment[span1] = spans_2

        return alignment
Example #36
from collections import defaultdict
import re
import json
from nltk import PorterStemmer
from nltk.corpus import words
import math
import string

INDEX_DICT = {}
#DOC_ID_DICT = {}
directory = "C:\\Users\\tajun\\PycharmProjects\\ICS-121\\DevlopZip\\DEV"
doc_counter = 0
partial_counter = 0
NumOfDocs = 0
ps = PorterStemmer()
token_count = 0
output_dict = {}  # where {filenum: (word, [list of postings])}
skip_count = 0


class Postings:  #each doc id is a posting?
    def __init__(self, docid, positions):
        self.docid = docid
        self.positions = positions
        self.tfidf = 0  # use freq counts for now

    #   self.fields = fields


# takes in a file name to tokenize and returns a list of tokens; alternatively it could return a list of lists where the first element is the token, the second is the count, and so on.
Example #37
def stem(a):
	a = a.strip('0123456789.,"[]()?!: ')
	a = PorterStemmer().stem(a)
	return a
# Lowercase the corpus
processed = raw_text.str.lower()

# Remove punctuation, white spaces
processed = processed.str.replace(r'[^\w\d\s]', ' ')
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')

# Remove stop words
stop_words = stopwords.words('english')
processed = processed.apply(lambda x: ' '.join(term for term in x.split()
                                               if term not in set(stop_words)))

# Reduce words to their stems using a Porter stemmer
porter = PorterStemmer()
processed = processed.apply(
    lambda x: ' '.join(porter.stem(term) for term in x.split()))

# Construct a design matrix using an n-gram model and a tf-idf statistics
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
features = vectorizer.fit_transform(processed)

# Prepare the training and test sets using an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels_enc,
                                                    test_size=0.2,
                                                    random_state=4,
                                                    stratify=labels_enc)

# Train SVM with a linear kernel on the training set
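# The example stops at the comment above; a minimal sketch of that final step,
# assuming scikit-learn's LinearSVC as the linear-kernel SVM:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

svm = LinearSVC()
svm.fit(X_train, y_train)
print("Test accuracy:", accuracy_score(y_test, svm.predict(X_test)))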
Example #39
def preProssess(filename):

    # Local get for text file
    file = open(filename, "r", encoding='utf-8')
    # Use this to read file content as a stream:
    fullText = file.read()
    sentences = fullText.split('\n')
    spell = Speller(fast=True)

    stop_words = set(stopwords.words('english'))
    Documents = {}
    vocab = {}
    for i in range(len(sentences)):
        # split by tab
        currSentenceTuple = sentences[i].split('\t')
        # [docid, sentence]
        # the key is the tweetid and the value is the tweet text
        # (full sentence, dictionary of words and their weights, length)
        Documents[currSentenceTuple[0]] = (currSentenceTuple[1], {}, 0)
        # start the preprocessing here
        currSentenceValue = currSentenceTuple[1]
        # lower case
        currSentenceValue = currSentenceValue.lower()
        # remove URLS
        currSentenceValue = re.sub(r'http\S+', '', currSentenceValue)
        # create our tokenizer that will also remove punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        # removing the I'm , can't to Im and cant
        currSentenceValue = currSentenceValue.replace("'", "")

        #autocorrect spelling mistakes
        currSentenceValue = spell(currSentenceValue)

        # tokenize here
        currSentenceValue = tokenizer.tokenize(currSentenceValue)
        # remove stop words
        porterStemmer = PorterStemmer()
        currSentenceValue = [porterStemmer.stem(w) for w in currSentenceValue if not w in stop_words]

        # finished preprocessing the tweet
        # send the preprocessed tweet to be indexed
        (vocab, Documents) = indexing(currSentenceValue, currSentenceTuple, Documents, vocab)

    tf_max = 0
    for word in vocab:
        if vocab[word][0] > tf_max:
            tf_max = vocab[word][0]

    numOfDocs = len(Documents)
    for docid in Documents:
        length = 0
        for wordsInDoc in Documents[docid][1]:
            df_i = vocab[wordsInDoc][0]
            idf = math.log((numOfDocs / df_i), 2)
            tf_ij = Documents[docid][1][wordsInDoc] / len(Documents[docid][1])
            w_ij = tf_ij * idf
            Documents[docid][1][wordsInDoc] = w_ij
            length += w_ij ** 2
        (doc, sentence, l) = Documents[docid]
        Documents[docid] = (doc, sentence, math.sqrt(length))
    return (vocab, Documents)
Example #40
    def ordered_stems(self) -> List[str]:
        from nltk import PorterStemmer

        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in self.tokens]
Example #41
    def stemmed_labels(self) -> Set[str]:
        from nltk import PorterStemmer

        stemmer = PorterStemmer()
        return {stemmer.stem(label) for label in self.labels}
def stemIt(word_list, stemmer= PorterStemmer(), encoding= "utf8"):
    tmp = []
    for w in word_list:
        tmp.append(stemmer.stem(w).encode(encoding))
    return tmp
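# A usage sketch for stemIt; note that the encode() call makes the returned
# items bytes rather than str:
# >>> stemIt(["running", "connections"])
# [b'run', b'connect']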
Example #43
def queryResults(queryString, vocabDict, documents, numberOfRowsForResults):
    stop_words = set(stopwords.words('english'))
    scores = {}
    N = len(documents)
    queryString = queryString.lower()
    #queryStringExpansion = queryExpansionMethod(model_glove_twitter,queryString)
    queryStringExpansion = queryString
    # create our tokenizer that will also remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # removing the I'm , can't to Im and cant
    queryString = queryString.replace("'", "")
    # tokenize here
    queryString = tokenizer.tokenize(queryString)
    # remove stop words
    porterStemmer = PorterStemmer()
    queryString = [
        porterStemmer.stem(w) for w in queryString if not w in stop_words
    ]

    # we are collecting the weights for the query string and its length
    weightsForQuery = {}
    lengthOfQuery = 0
    for stemword in queryString:
        if stemword.isnumeric():
            continue
        # check whether the stem word is actually in our vocab; if it's not, we can simply skip it
        if stemword not in vocabDict:
            continue
        # docsFoundForStemWord = vocabDict[stemword]
        # calculate weight for query word i
        df_i = vocabDict[stemword][0]
        tf_iq = queryString.count(stemword) / len(queryString)
        idf = math.log((N / df_i), 2)
        w_iq = (0.5 + 0.5 * tf_iq) * idf
        if stemword not in weightsForQuery:
            weightsForQuery[stemword] = w_iq
            lengthOfQuery += w_iq**2

    # we now have the length of the query vector and a dict of weights w_iq
    lengthOfQuery = math.sqrt(lengthOfQuery)

    # print(weightsForQuery)

    for word in weightsForQuery:
        docsFoundForStemWord = vocabDict[word][1]
        for doc in docsFoundForStemWord:
            scores[doc] = cosineCalculator(doc, documents, lengthOfQuery,
                                           weightsForQuery)

    arrayOfSortedScoresTuples = sorted(scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    #here we add a dictionary that will store the documents and their new scores on the query expansion
    arrayOfSortedScoresTuplesExpanded = {}
    for i in range(len(arrayOfSortedScoresTuples)):
        docId = arrayOfSortedScoresTuples[i][0]
        originalScore = arrayOfSortedScoresTuples[i][1]
        docSentence = documents[docId][0]  #get sentence
        #get the tokens in our twitter embedding model
        tokens_1 = [t for t in docSentence.split() if t in model_glove_twitter]
        tokens_2 = [
            t for t in queryStringExpansion.split() if t in model_glove_twitter
        ]
        cosine = 0
        if (len(tokens_1) > 0 and len(tokens_2) > 0):
            cosine = model_glove_twitter.n_similarity(tokens_1, tokens_2)
            #take the average of both scores!
            newScoreAvg = (originalScore + cosine) / 2
            #store the score with the document
            arrayOfSortedScoresTuplesExpanded[docId] = newScoreAvg
    #sort by highest value!
    arrayOfSortedScoresTuplesExpanded = sorted(
        arrayOfSortedScoresTuplesExpanded.items(),
        key=lambda x: x[1],
        reverse=True)
    return arrayOfSortedScoresTuplesExpanded[:numberOfRowsForResults]
def stemming_by_portter_1(term):
    return PorterStemmer().stem(term)
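# stemming_by_portter_1 constructs a new PorterStemmer on every call; a sketch
# of the usual pattern that reuses a single module-level instance instead (the
# stemmer keeps no state between calls, so reusing it sequentially is fine;
# the names below are hypothetical):
_porter = PorterStemmer()

def stemming_by_porter_cached(term):
    return _porter.stem(term)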
import nltk
import pandas as pd
import numpy as np
import pickle
import re
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer

data = pd.read_csv('spam.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

data['Type'] = data['Type'].map({'ham': 0, 'spam': 1})
X = data['Message']
y = data['Type']

stem = PorterStemmer()
corpus = []

for i in range(len(data)):
    words = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    words = words.lower()
    words = words.split()
    words = [
        stem.stem(word) for word in words
        if word not in set(stopwords.words('english'))
    ]
    words = ' '.join(words)
    corpus.append(words)

#creating BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
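# The example breaks off after this import; a minimal sketch of the announced
# bag-of-words step (max_features is an assumed, tunable value):
cv = CountVectorizer(max_features=2500)
X_bow = cv.fit_transform(corpus).toarray()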
def tokenizer(direc_path):
    # get stop-list
    # change working directory to specified one
    # process all files and tokenize

    # get all lines of stop-list
    # remove newline character at the end
    fp = open('stoplist.txt', 'r')
    stoplist = list(fp)
    fp.close()
    for i in range(len(stoplist)):
        stoplist[i] = stoplist[i][:-1]

    # change path to where corpus is
    current_dir = os.getcwd()
    os.chdir(direc_path)
    flist = []
    flist.extend(os.listdir(direc_path))

    term_dictionary = {}
    doc_dictionary = {}
    doc_id = 1
    term_id = 1
    # main loop
    for fname in os.listdir():

        # read file and add its name and ID to a dictionary
        fp = open(fname, 'r', errors='ignore')
        content = fp.read()
        fp.close()
        doc_dictionary[doc_id] = fname
        doc_id += 1

        # ignoring initial headers
        substr = "<!DOCTYPE"
        index = content.find(substr)
        htmlcode = content[index:]
        # get parsed result
        result = parsehtml(htmlcode)

        # Tokenize and turn to lower case
        token_list = nltk.regexp_tokenize(result, "[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+")
        for i in range(len(token_list)):
            token_list[i] = token_list[i].lower()
        # print(token_list)
        # print(len(token_list))

        # ignore tokens if they're in stop list
        i = 0
        deleteflag = False
        while i < len(token_list):
            for s in stoplist:
                if s == token_list[i]:
                    del token_list[i]
                    deleteflag = True
                    break
            if deleteflag:
                deleteflag = False
            else:
                i += 1

        # print(len(token_list))

        # stem the token list
        stemmer = PorterStemmer()
        for i in range(len(token_list)):
            token_list[i] = stemmer.stem(token_list[i])


        # put terms as key in dictionary with incremented term id as value
        for i in range(len(token_list)):
            if token_list[i] not in term_dictionary:
                term_dictionary[token_list[i]] = term_id
                term_id = term_id + 1

    # write doc dictionary to file; each line is "filename<TAB>doc id"
    os.chdir(current_dir)
    f = open('docids.txt', 'w')
    for value, key in doc_dictionary.items():
        f.write(str(key) + '\t' + str(value) + '\n')
    f.close()

    # write term dictionary to file
    f = open('termids.txt', 'w', errors='ignore')
    for key, value in term_dictionary.items():
        f.write(str(value) + '\t' + str(key) + '\n')
    f.close()
    return (term_dictionary,doc_dictionary)
Example #47
 def stem_it(self):
     stemmer = PorterStemmer()
     self.word = stemmer.stem(self.word)
Example #48
######################################################################
#  CliNER - word_features.py                                         #
#                                                                    #
#  Willie Boag                                      [email protected] #
#                                                                    #
#  Purpose: Isolate all word-level features into a single file       #
######################################################################

import re
from cliner.features_dir.wordshape import getWordShapes
from nltk import LancasterStemmer, PorterStemmer

__author__ = 'Willie Boag'
__date__ = 'Apr 27, 2014'

lancaster_st = LancasterStemmer()
porter_st = PorterStemmer()


def feature_word(word):
    return {('word', word.lower()): 1}


def feature_stem_lancaster(word):
    return {('stem_lancaster', lancaster_st.stem(word.lower())): 1}


def feature_generic(word):
    generic = re.sub('[0-9]', '0', word)
    return {('Generic#', generic): 1}

Example #49
 def __init__(self):
     self.stemmer = PorterStemmer()
Example #50
@author: nausheenfatma
"""
import sys
import logging
from XMLCustomParser import WikiXmlHandler
import xml.sax
import time
from datetime import datetime
import ast
from nltk import PorterStemmer
import argparse
#from MergeIndices import batch_sort
from MergeIndices import MergeIndices

sno = PorterStemmer()

punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
remove_punctuation_map = dict((ord(char), 32) for char in punctuation)
number = '0123456789'
remove_number_map = dict((ord(char), None) for char in number)


class Document():
    def __init__(self):
        self.doc_id = {}
        self.title = {}
        self.body = {}
        self.infobox = {}
        self.categories = {}
        self.external_links = {}
def stem_words(f):
	stemmer=PorterStemmer()
	processed=tokenize(f)
	for i in range(len(processed)):
		processed[i]=stemmer.stem(processed[i])
	return processed
Example #52
def try_basic_query_tokenizer():
    stemmer = PorterStemmer()
    x = "answer(cityid('new york', _))"
    y = basic_query_tokenizer(x, strtok=lambda x: [stemmer.stem(xe) for xe in x.split()])
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
# from nltk.tokenize import word_tokenize

df = pd.read_json('related_data_rm_duplicacy.json')
QATags = df.content
# print(QATags)
QATags = list(QATags)
# print(QATags[:10])

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
port = PorterStemmer()


def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    # print(stop_free)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # print(punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    stem = " ".join(port.stem(word) for word in normalized.split())
    remove_non_english = stem.encode("ascii", errors="ignore").decode()
    return remove_non_english


Text_clean = [clean(doc).split() for doc in QATags]
for inputCurr in lstFileNames:
    file_object = open(input_dir + "//" + inputCurr, 'r')
    try:
        for line in file_object:
            try:
                line.encode('ascii')
            except UnicodeEncodeError:
                continue  ## skip lines with non-ASCII characters
            # Strip punctuations
            line = StripPunc(line)
            for word in line.split():
                count = count + 1
                # make word lower case and stem  word
                word = word.lower()
                word = PorterStemmer().stem(word)
                if word in dictWords: 
                    val = dictWords[word]
                    dictWords[word] = val + 1
                else:
                    dictWords[word] = 1
    finally:
        file_object.close()
############################################################################
#                    Print Summary Statistics                              #
############################################################################   
print "Completed building index of total words seen:\n", count
print "Total unique words after stemming in list:\n", len(dictWords)
keys = dictWords.keys()

###########################################################################
Example #55
quotes_token = nltk.word_tokenize(qt)

quotes_bigrams = list(nltk.bigrams(quotes_token))
print(quotes_bigrams)

quotes_trigrams = list(nltk.trigrams(quotes_token))
print(quotes_trigrams)

quotes_quadgrams = list(nltk.ngrams(quotes_token, 4))
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer

pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer
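# The example is cut off after this import; a plausible continuation in the
# same pattern as the Porter and Lancaster stemmers above (the "english"
# language argument is an assumption):
snst = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + snst.stem(words))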
best_seller_group = shampoo.groupby('best_selling', )
best_seller_group.agg(['mean', 'std', 'median'])

rating_mask = shampoo['rating'].isnull() == False
rating_group = shampoo.loc[rating_mask, :]
rating_group['rating'] = rating_group['rating'].astype('float')
rating_grouped = rating_group.groupby('best_selling', )

rating_grouped.agg(['mean', 'std', 'median'])

# Natural Language Processing
from nltk.corpus import stopwords
stop = stopwords.words('english')
from textblob import TextBlob
from nltk import PorterStemmer
stemmer = PorterStemmer()
import nltk

df = pd.read_csv('description_df')
df['nlp_description'] = df['nlp_description'].astype('string')

# add product specific stop words
stop.extend([
    'shampoo', 'conditioner', 'soap', 'cleanse', 'hair', 'head', 'shoulders',
    'loréal', 'pari', 'product', 'help', 'use', 'free', 'make', 'type'
])

#Pre Processing
#remove stop words
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop))
Example #57
from nltk import MWETokenizer, sent_tokenize, PorterStemmer
import json
import re
import string
import wiki
import pickle
ingredients = set([])
tokenizer = MWETokenizer()
utensil_tokenizer = MWETokenizer()
method_tokenizer = MWETokenizer()
measurements = set([])
techniques = set([])
ps = PorterStemmer()

mexican = {}
chinese = {}
food = set([])
unnaccounted_methods = [
    "broil", "mix", "grease", "coat", "arrange", "sprinkle"
]
unnaccounted_tools = ["bowl", "dish", "broiler"]
with open('healthy.pickle', 'rb') as handle:
    healthy = pickle.load(handle)


def type(food, foodtypes):
    possibleHits = food.split()
    for h in possibleHits:
        for key in foodtypes.keys():
            if h in key:
                return (food, foodtypes[key])
Example #58
 def s(tokens):
     return [PorterStemmer().stem(t) for t in tokens]
import re

from nltk import PorterStemmer

from BugSimilarityScoreCalculator import BugSimilarityScoreCalculator
from DataSetFieldEnum import DataSetFieldEnum
from FinalRank import FinalRank
from RVSMCalculator import RVSMCalculator
from VSMSimilarityCalculator import VSMSimilarityCalculator

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
porter_stemmer = PorterStemmer()
ranks_file = open('ranks_file.txt', 'w')

class BugLocalization:
    def __init__(self,dataset):
        self.dataset = dataset


    def run(self):
        self.dataset.reset_calculation_lists()
        for i in range(self.dataset.get_bug_report_list_lenght()):
            current_bug_report = self.dataset.bug_report_list[i]
            self.dataset.results = {}
            self.localize_bugs(current_bug_report)

            first_file_pos_ranked = self.calculate_rank_first(self.dataset,current_bug_report)
            files_binary_relevance = self.calculate_binary(current_bug_report,self.dataset)
            top_n_rank = self.calculate_tops(current_bug_report,self.dataset)
def stem(array):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in array]