Example #1
def clean(text):
    """
    returns a list of words, from text, removing unnecessary punctuation and
    stopwords
    """
    from stopwords import stopwords
    import string

    # trimming stopwords:
    stops = stopwords()
    text = text.split()
    ret = []
    for word in text:
        if (word not in stops) and (len(word) > 1):
            ret.append(word)

    # trimming punctuation:
    punc = string.punctuation
    for i in range(len(ret)):
        og = ret[i]
        if og[0] in punc: og = og[1:]
        if og[-1] in punc: og = og[:-1]
        ret[i] = og

    return ret
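
For reference, a minimal self-contained sketch of the same cleaning idea; the project's stopwords module is replaced by a small hard-coded set, and DEMO_STOPS / clean_demo are illustrative names, not part of the original code:

import string

DEMO_STOPS = {"the", "a", "an", "and", "of"}    # stand-in for stopwords()

def clean_demo(text, stops=DEMO_STOPS):
    """Same idea as clean(): drop stopwords, then trim punctuation at word edges."""
    kept = [w for w in text.split() if w not in stops and len(w) > 1]
    trimmed = [w.strip(string.punctuation) for w in kept]
    return [w for w in trimmed if w]            # discard tokens that were pure punctuation

print(clean_demo('The quick, "brown" fox -- of legend.'))
# ['The', 'quick', 'brown', 'fox', 'legend']
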
Example #2
def index(request):
    global swords
    if len(swords) == 0:
        swords = set(stopwords("cache"))
        swords.add("rlq")
        swords.add("u")
        swords.add("wp")
        print "stopwords fixed!"
    inCache = IndexedUrl.objects.all()
    return render(request, 'query/index.html', {'inCache': inCache})
Example #3
def __init__(self):
    '''
    self.saver : Pickles the object onto the disk.
    self.ps    : Porter stemmer class object. It is required to get the stem of a word.
    self.st    : Class object to check for stop words.
    '''
    self.saver = save_object()
    self.ps = PorterStemmer()
    self.st = stopwords()
    self.m_ds = {}
    self.models = {}
    self.m_mod = model()
Example #4
def main(csv_file=None, number_of_articles=None):
    """
    Main method.

    Purpose:
        Aggregator and central controller that coordinates all the modules
    Working:
        >> main(csv_file='~/suyash/Sample.csv')
    """
    working_string = csv_to_str.csv_2_str(csv_file=csv_file)
    stopword_set = stopwords.stopwords()
    print(nltk_stemmer.word_tokenizer(
        string_to_tokenize=working_string,
        stopset=stopword_set,
        top_number_of_articles=number_of_articles))
Example #5
def search(path_to_index, queries):
    a = os.path.join(path_to_index, 'inverted_index')
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search
    stemmer = stemmer()
    stopwords = stopwords()
    search = search(a, stemmer, stopwords)
    outputs = []
    for query in queries:
        the_result = search.search(query)
        if the_result is None:
            outputs.append(['.'] * 10)
        else:
            outputs.append(the_result)
    return outputs
Example #6
def search(request):
    global swords
    if len(swords) == 0:
        swords = set(stopwords("cache"))
        swords.add("rlq")
        swords.add("u")
        swords.add("WP")
        print "stopwords fixed!"
    inCache = IndexedUrl.objects.all()
    backupnumDocs = 0
    backup = []
    qaug = ""
    try:
        # if True:
        q = request.GET.get('q', None)
        try:
            k = int(request.GET.get('k', None))
        except:
            k = 0
        print "searching ", q, ", ", k
        rankedfiles = SearchFiles(q)
        if k != 0:
            for docID in rankedfiles:
                backup.append(Docs.objects.get(num=int(docID)))
            backupnumDocs = len(backup)
            qaug = queryenhancement(q, swords, k, rankedfiles, "cache",
                                    inCache[0].number)
            # print qaug
            rankedfiles = SearchFiles(q + qaug)
        url_title = []
        for docID in rankedfiles:
            url_title.append(Docs.objects.get(num=int(docID)))
        numDocs = len(url_title)
        return render(
            request, 'query/present.html', {
                'docs': url_title,
                'q': q,
                'qaug': qaug,
                'numDocs': numDocs,
                'inCache': inCache,
                'backup': backup,
                'backupnumDocs': backupnumDocs
            })
    except:
        return render(request, 'query/error.html', {'inCache': inCache})
Example #7
def fix(request):
    global swords
    print "start fixing"
    swords = set(stopwords("cache"))
    swords.add("rlq")
    swords.add("u")
    swords.add("wp")
    fp = open("stopwords.txt", "w")
    fp.write("<table>\n<tr>\n")
    i = 0
    for w in swords:
        fp.write(" <td> %s </td> " % w)
        i += 1
        if (i % 8 == 0):
            fp.write("</tr>\n<tr>")

    fp.write("</tr>\n</table>")
    fp.close()
    return HttpResponse("StopWords fixed!")
Example #8
def Tokens(filename):
    "对初始文本进行分词、去停用词,返回分词列表"
    # 去除文本空格、空行
    with open(filename, encoding='utf=8') as f_obj:
        lines = f_obj.readlines()
        contents = ''
        for line in lines:
            contents = contents + line.strip()

    # Tokenize with jieba
    text = jieba.lcut(contents)

    # Load the stopword list
    stopword_list = stopwords.stopwords()

    # Remove stopwords
    seg = []
    for word in text:
        if word not in stopword_list:
            seg.append(word)
    return seg
Example #9
def loginFormHandling():
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search
    stemmer = stemmer()
    stopwords = stopwords()
    search = search("path", stemmer, stopwords)
    data = request.form
    query = request.form['query']
    flag = 0
    if data['title'] != '':
        flag = 1
        query += " title:"
        query += data['title']
    if data['infobox'] != '':
        flag = 1
        query += " infobox:"
        query += data['infobox']
    if data['references'] != '':
        flag = 1
        query += " ref:"
        query += data['references']
    if data['category'] != '':
        flag = 1
        query += " category:"
        query += data['category']
    if data['links'] != '':
        flag = 1
        query += " links:"
        query += data['links']
    if data['body'] != '':
        flag = 1
        query += " body:"
        query += data['body']
    # if (flag == 1):
    # 	r = len(request.form ['query'])
    # 	query = query[r:]
    print(query)
    the_result = search.search(query)
    return render_template('results.html', query=query, results=the_result)
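
The chain of if-blocks above just appends a "field:value" clause for every non-empty form field. A compact sketch of that query-building step (build_query is an illustrative helper, not part of the original code):

def build_query(form):
    """Append a 'field:value' clause for every non-empty fielded input."""
    fields = ['title', 'infobox', 'references', 'category', 'links', 'body']
    prefixes = {'references': 'ref'}                 # form key -> query prefix
    query = form.get('query', '')
    for field in fields:
        value = form.get(field, '')
        if value:
            query += " %s:%s" % (prefixes.get(field, field), value)
    return query

print(build_query({'query': 'sachin', 'title': 'tendulkar', 'category': 'cricket'}))
# sachin title:tendulkar category:cricket
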
Example #10
def lambda_handler(event, context):

    book = event['queryStringParameters']['book']
    query = event['queryStringParameters']['query']
    s3 = boto3.resource('s3')
    wordMap = pickle.loads(
        s3.Bucket("pdftags").Object(book + ".pickle").get()['Body'].read())

    # with BytesIO() as data:
    #     s3.Bucket("pdftags").download_fileobj(book+".pkl", data)
    #     data.seek(0)    # move back to the beginning after writing
    #     wordMap = pickle.load(data)

    keywords = query.split()
    match = {}
    for keyword in keywords:
        keyword = keyword.lower()
        if (keyword in stopwords.stopwords().words):
            continue
        for head in wordMap:
            wordMap[head].text = wordMap[head].text.lower()
            headWords = wordMap[head].text.split()
            if (keyword in wordMap[head].text):
                match[head] = match.get(head, 0) + 1

    sort_match = sorted(match.items(),
                        key=lambda x: wordMap[x[0]].fontSize,
                        reverse=True)
    sort_match = sorted(sort_match, key=lambda x: x[1], reverse=True)

    return {
        'statusCode': 200,
        'headers': {
            'Content-Type': 'application/json'
        },
        'body': json.dumps(sort_match)
    }
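
The two sorted() calls above rely on Python's sort being stable: entries are ordered by font size first, then re-sorted by match count, so headings with equal counts keep their font-size order. A small illustration with made-up data:

# stable two-pass sort: secondary key (font size) first, primary key (match count) second
items = [("intro", 1, 14), ("methods", 2, 10), ("results", 2, 18), ("notes", 1, 10)]
#        (heading, match_count, font_size)

by_font = sorted(items, key=lambda x: x[2], reverse=True)    # secondary key
ranked = sorted(by_font, key=lambda x: x[1], reverse=True)   # primary key
print([h for h, _, _ in ranked])
# ['results', 'methods', 'intro', 'notes']
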
Example #11
def crawl(request):
    inCache = IndexedUrl.objects.all()
    start = request.POST['start']
    number = int(request.POST['number'])
    domain = request.POST['domain']
    global swords
    try:
        fHash = BuildSearchEngine(start, number, domain)
        for q in IndexedUrl.objects.all():
            q.delete()
        for q in Docs.objects.all():
            q.delete()
        '''for q in StopWords.objects.all():
            q.delete()'''

        log = IndexedUrl.objects.create(start=start,
                                        number=number,
                                        domain=domain,
                                        timestamp=timezone.now())
        log.save()
        for num in fHash:
            url, title = fHash[num]
            doc = Docs.objects.create(num=num, url=url, title=title)
            doc.save()
        swords = set(stopwords("cache"))
        '''for w in stopwords("cache"):
            word = StopWords.objects.create(word=w)
            word.save()'''
        # print fHash
        return render(request, 'query/results.html', {
            'start': start,
            'number': num + 1,
            'domain': domain
        })
    except:
        return render(request, 'query/error.html', {'inCache': inCache})
Example #12
                session.add(term)

            frequency = session.query(TermFrequency).\
              filter(TermFrequency.document == doc, TermFrequency.term == term).first()
            if not frequency:
                frequency = TermFrequency(document=doc, term=term)
                session.add(frequency)
            else:
                frequency.frequency += 1

    session.flush()
    session.commit()


if __name__ == '__main__':
    stopword_list = stopwords()

    def document_terms():
        for filepath, content, date in documents():
            print(filepath)

            extension = path.splitext(filepath)[1]

            words = None
            title = filename(filepath)

            if extension in ['.html', '.htm', '.jspy']:
                html_title, words, links = tokenize_html(content)
                html_title = html_title.strip()
                if html_title:
                    title = html_title
Example #13
import ssl
import urllib.request

from bs4 import BeautifulSoup

# `st` is the project's helper module (stripNonAlphaNum, stopwords, remove,
# wordListToFreqDict, sortFreqDict) and is assumed to be imported already.

ssl._create_default_https_context = ssl._create_unverified_context
url = 'https://www.bbc.com/news/uk-51991887'

stopword = []
freq = []
word = []

response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(strip=True)
fullwordlist = st.stripNonAlphaNum(text)
stopword = st.stopwords()

# this method still has some problems, so it is left commented out for now
# wordList = st.removeStopWord(fullwordlist, stopword)

wordlist = st.remove(fullwordlist, stopword)
dictionary = st.wordListToFreqDict(wordlist)
sorteddict = st.sortFreqDict(dictionary)

for t in sorteddict:
    freq.append(t[0])

for i in sorteddict:
    word.append(i[1])

print(sorteddict)
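
The st helpers above are project-specific; for orientation, here is a rough standard-library equivalent of the tokenize / filter / count steps (the stopword set and freq_from_text are illustrative stand-ins):

import re
from collections import Counter

def freq_from_text(text, stops=frozenset({"the", "a", "and", "of", "to", "in"})):
    """Keep alphanumeric tokens, drop stopwords, and count what remains."""
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    counts = Counter(t for t in tokens if t not in stops)
    # same shape as sortFreqDict's output: (frequency, word) pairs, most frequent first
    return sorted(((n, w) for w, n in counts.items()), reverse=True)

print(freq_from_text("The cat and the other cat sat in the sun."))
# [(2, 'cat'), (1, 'sun'), (1, 'sat'), (1, 'other')]
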
Example #14
# -*- coding: utf-8 -*-

from thinkbayes import Pmf
import string
from stopwords import stopwords

# initialize the Pmf
pmf = Pmf()

# generate word list from text
fname = 'data/aristotle.txt'
words = []
stopwords = stopwords('data/stopwords.txt')
with open(fname) as f:
    for line in f:
        processed = line.strip().translate(str.maketrans('', '', string.punctuation)).split()
        for word in processed:
            if word.lower() in stopwords:
                continue
            words.append(word.lower())

# generate unique word set
u_words = set(words)

# count words
for word in words:
    pmf.Incr(word, 1)

# normalize word frequencies to probabilities
pmf.Normalize()
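
Setting thinkbayes.Pmf aside, the Incr/Normalize step amounts to dividing raw counts by their total; a plain-dictionary sketch with made-up words:

counts = {}
for w in ["cat", "dog", "cat", "bird"]:
    counts[w] = counts.get(w, 0) + 1                    # Incr(word, 1)

total = sum(counts.values())
probs = {w: n / total for w, n in counts.items()}       # Normalize()
print(probs)
# {'cat': 0.5, 'dog': 0.25, 'bird': 0.25}
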
Example #15
        term = Term(text = term_text)
        session.add(term)
      
      frequency = session.query(TermFrequency).\
        filter(TermFrequency.document == doc, TermFrequency.term == term).first()
      if not frequency:
        frequency = TermFrequency(document = doc, term = term)
        session.add(frequency)
      else:
        frequency.frequency += 1
  
  session.flush()
  session.commit()

if __name__ == '__main__':
    stopword_list = stopwords()
    
    def document_terms():
      for filepath, content, date in documents():
        print(filepath)
        
        extension = path.splitext(filepath)[1]
        
        words = None
        title = filename(filepath)
        
        if extension in ['.html', '.htm', '.jspy']:
          html_title, words, links = tokenize_html(content)
          html_title = html_title.strip()
          if html_title:
            title = html_title
Example #16
# - Site HTML

# Imports
from re import findall
from os import listdir
from collections import defaultdict, Counter
from math import sqrt
from datetime import datetime
from pdb import set_trace

from pymongo import MongoClient
from stopwords import stopwords
from bs4 import BeautifulSoup

# setup
stopwords = stopwords()
directory = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/alexa_data/"
html_directory = "/Users/mruttley/Documents/2015-05-11 StatCounter HTML Crawler/html/"
ranking_directory = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/ranking_files/"
verbose = False

# Auxiliary functionality


def get_ranking():
    """Gets a ranking of domains"""
    timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d")
    filename = timestamp + "top-1m.csv"
    domains = []
    with open(ranking_directory + filename) as f:
        for line in f:
Example #17
# - Site HTML

#Imports
from re import findall
from os import listdir
from collections import defaultdict, Counter
from math import sqrt
from datetime import datetime
from pdb import set_trace

from pymongo import MongoClient
from stopwords import stopwords
from bs4 import BeautifulSoup

#setup
stopwords = stopwords()
directory = '/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/alexa_data/'
html_directory = "/Users/mruttley/Documents/2015-05-11 StatCounter HTML Crawler/html/"
ranking_directory = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/ranking_files/"
verbose = False

#Auxiliary functionality


def get_ranking():
    """Gets a ranking of domains"""
    timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d')
    filename = timestamp + "top-1m.csv"
    domains = []
    with open(ranking_directory + filename) as f:
        for line in f:
Example #18
def __pipeline(classifier,
               train,
               test,
               train_y,
               test_y,
               scoring,
               task='train'):
    cls_name = classifier.__class__.__name__
    print(f'#####  {cls_name}:{task}  #####')
    timer = Timer()
    timer.start()
    pipe = Pipeline([
        ('features',
         FeatureUnion([
             ('tfidf',
              TransformPipeline([('extract',
                                  ColumnExtractor(col='text', as_type=str)),
                                 ('vector',
                                  TfidfVectorizer(analyzer='word',
                                                  tokenizer=__tokenizer,
                                                  preprocessor=__preprocessor,
                                                  stop_words=stopwords(),
                                                  min_df=10,
                                                  sublinear_tf=True))])),
             ('topic',
              TransformPipeline([('extract',
                                  ColumnExtractor(col='text', as_type=str)),
                                 ('vector',
                                  CountVectorizer(analyzer='word',
                                                  tokenizer=__tokenizer,
                                                  preprocessor=__preprocessor,
                                                  stop_words=stopwords(),
                                                  min_df=10)),
                                 ('lda',
                                  TransformLatentDirichletAllocation(
                                      n_components=8,
                                      max_iter=3,
                                      learning_method='online',
                                      learning_offset=10.,
                                      n_jobs=4,
                                      random_state=__random_state))])),
             ('author', PrefixColumnExtractor(prefix='author_', as_type=int)),
             ('token_count',
              TransformPipeline([('extract',
                                  ColumnExtractor(col='token_count',
                                                  as_type=int,
                                                  as_matrix=True)),
                                 ('scale', MaxAbsScaler())])),
             ('token_length_mean',
              TransformPipeline([('extract',
                                  ColumnExtractor(col='token_length_mean',
                                                  as_type=float,
                                                  as_matrix=True)),
                                 ('scale', MaxAbsScaler())])),
             ('char_count',
              TransformPipeline([('extract',
                                  ColumnExtractor(col='char_count',
                                                  as_type=int,
                                                  as_matrix=True)),
                                 ('scale', MaxAbsScaler())])),
             ('digit_char_ratio',
              ColumnExtractor(col='digit_char_ratio',
                              as_type=float,
                              as_matrix=True)),
             ('alpha_char_ratio',
              ColumnExtractor(col='alpha_char_ratio',
                              as_type=float,
                              as_matrix=True)),
             ('upper_char_ratio',
              ColumnExtractor(col='upper_char_ratio',
                              as_type=float,
                              as_matrix=True)),
             ('space_char_ratio',
              ColumnExtractor(col='space_char_ratio',
                              as_type=float,
                              as_matrix=True)),
             ('punctuation_char_ratio',
              ColumnExtractor(col='punctuation_char_ratio',
                              as_type=float,
                              as_matrix=True))
         ])), ('model', classifier)
    ])
    # print(f'pipeline steps={repr(pipe.steps)}')
    if task == 'test':
        __train(pipe, train, train_y)
        __test(pipe, test, test_y)
    elif task == 'train':
        __train(pipe, train, train_y)
    elif task == 'tune':
        param_grid = {
            # 'features__token_length_mean__scale': [None, MaxAbsScaler()]
            # 'features__tfidf__vector__min_df': [1, 10, 100]
            # 'features__topic__lda__n_components': [8, 16, 32]
            # 'features__topic__lda__max_iter': [3, 6, 10]
            # 'model__penalty': ['l2', 'l1', 'elasticnet']
            # 'model__alpha': [0.0001, 0.001, 0.01]
            # 'model__loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
            'model__max_iter': [1000, 40000]
        }
        __grid_search(pipe, param_grid, train, train_y, scoring=scoring)
    elif task == 'validate':
        __validate(pipe, train, train_y, scoring)
    else:
        raise ValueError(f'Invalid value: task={task}')
    timer.stop()
    print(
        f'__pipeline {cls_name}:{task} took {seconds_to_hhmmss(timer.elapsed)}'
    )
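
For orientation, a pared-down sketch of the same Pipeline / FeatureUnion shape using only scikit-learn built-ins and toy data; the custom column extractors, LDA step, and ratio features are omitted, and stop_list stands in for stopwords():

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier

texts = ["spam spam buy now", "meeting at noon", "cheap pills buy", "lunch at the cafe"]
labels = [1, 0, 1, 0]
stop_list = ["at", "the", "now"]

pipe = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer(stop_words=stop_list)),    # weighted terms
        ('counts', CountVectorizer(stop_words=stop_list)),   # raw term counts
    ])),
    ('model', SGDClassifier(max_iter=1000)),
])

pipe.fit(texts, labels)
print(pipe.predict(["buy cheap spam"]))    # classify a new document
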