Ejemplo n.º 1
0
def find_similar_pairs(data, output_path='similar_0817.json'):
    """
    Find the most similar document for each document in the collection and
    write the resulting pairs to a JSON file.

    The output is a single JSON object that maps the ``link`` of a document
    to the ``link`` of the document paired with it.  If the same first link
    occurs in more than one pair, the last pair wins (plain dict semantics).

    :param data: raw input handed unchanged to ``TextProcessor.map_json_data``.
    :param output_path: file the JSON object is written to; defaults to the
        previously hard-coded ``'similar_0817.json'``.
    """
    processor = TextProcessor()
    processor.map_json_data(data)
    # Each pair holds two indices into ``processor.doc_collection``.
    similar_pairs = processor.similarity_analysis()
    documents = processor.doc_collection
    data_output = {
        documents[first].link: documents[second].link
        for first, second in similar_pairs
    }

    with open(output_path, 'w') as file_output:
        json.dump(data_output, file_output)
Ejemplo n.º 2
0
class ProcessorTest(unittest.TestCase):
    """Unit tests for ``TextProcessor``.

    Covers document tokenising (``process_doc``), the inverse list and
    tf-idf matrix built by ``gen_matrix``, cosine similarity and top-k index
    selection.
    """

    @classmethod
    def setUpClass(cls):
        # Shared fixture, set once on the class; a tuple so that a test
        # cannot modify it by accident.
        cls.documents = ('The sky is very blue', 'The sun is bright',
                         'The sun in the sky is bright',
                         'We can see the shining sun, the bright SUN')

    def setUp(self):
        # A fresh processor per test keeps counters and collections isolated.
        self.processor = TextProcessor()

    def test_process_doc(self):
        self.assertEqual(self.processor.process_doc(self.documents[0]),
                         ['sky', 'blue'])
        self.assertEqual(self.processor.process_doc(self.documents[3]),
                         ['see', 'shine', 'sun', 'bright', 'sun'])
        # Two documents have been processed above.
        self.assertEqual(self.processor.doc_count, 2)

    def test_gen_matrix(self):
        for doc in self.documents:
            self.processor.doc_collection.append(
                self.processor.process_doc(doc))
        self.processor.gen_matrix()

        # verify the generated inverse list (term -> number of documents
        # that contain it)
        self.assertEqual(self.processor.inverse_list, {
            'blue': 1,
            'shine': 1,
            'sun': 3,
            'sky': 2,
            'see': 1,
            'bright': 3
        })

        # verify the tf-idf calculation; one row per document, and the
        # columns appear to follow blue, shine, sun, sky, see, bright.
        # NOTE(review): under Python 2 without
        # ``from __future__ import division`` the expression ``4 / 3`` is
        # integer division, so ``math.log(4 / 3)`` evaluates to 0.0 --
        # confirm this matches what the implementation computes.
        expected = [
            [math.log(4), 0, 0, math.log(2), 0, 0],
            [0, 0, math.log(4 / 3), 0, 0, math.log(4 / 3)],
            [0, 0, math.log(4 / 3), math.log(2), 0, math.log(4 / 3)],
            [0, math.log(4), 2 * math.log(4 / 3), 0, math.log(4),
             math.log(4 / 3)],
        ]
        # Floating-point results: compare with a tolerance instead of
        # requiring bit-for-bit equality.
        np.testing.assert_allclose(self.processor.doc_mat, expected)

    def test_consine_similarity(self):
        # the formula is the dot product of d1 and d2 over the product of
        # their euclidean lengths
        d1, d2 = [[1, 0, 2, 4], [0, 3, 2, 1]]
        self.assertAlmostEqual(self.processor.consine_similarity(d1, d2),
                               8 / (math.sqrt(21) * math.sqrt(14)))

    def test_get_top_items(self):
        arr = np.array([2, 6, 8, 4, 5, 3])
        # Indices of the three largest values, largest first.
        np.testing.assert_array_equal(self.processor.get_top_ind(arr, 3),
                                      [2, 1, 4])
Ejemplo n.º 3
0
    def __init__(self, parent):
        """Set up the frame, keep a reference to ``parent``, create the
        text processor and then build the user interface."""
        Frame.__init__(self)
        self.processor = TextProcessor()
        self.parent = parent

        # Widgets are created last, once the attributes above exist.
        self.initUI()
Ejemplo n.º 4
0
class Example(Frame):
    """Main window for browsing a corpus document by document.

    The left text pane shows the raw text of the current document and the
    right pane shows the output of ``TextProcessor.process`` for it.  A
    toolbar offers previous/next navigation and a category selector; the
    "Project" menu opens a corpus directory.
    """

    def __init__(self, parent):
        """Create the frame inside ``parent`` and build all widgets.

        :param parent: the top-level window hosting this frame; its title,
            geometry and menu bar are configured by this class.
        """
        # Pass the parent explicitly so the frame is not silently attached
        # to Tk's default root when ``parent`` is another top-level window.
        Frame.__init__(self, parent)
        self.parent = parent
        self.processor = TextProcessor()
        # No corpus is loaded yet; these defaults keep the navigation
        # buttons harmless until ``readCorpus`` has run.
        self.data = None
        self.curdoc = 0

        self.initUI()

    def readCorpus(self, path):
        """Load the corpus at ``path`` and show its first document.

        Also recomputes the processor's conditional frequencies for the
        currently selected category and refills the category menu with the
        corpus attributes.
        """
        self.data = Corpus(path)
        self.processor.calculateConditionalFrequency(self.data, self.selCategory.get())

        self.categoryOption['menu'].delete(0, 'end')
        for attr in self.data.attributes:
            # ``v=attr`` binds the current value; a plain closure would see
            # only the last attribute of the loop.
            self.categoryOption['menu'].add_command(label=attr, command=lambda v=attr: self.changeCategory(v))

        self.curdoc = 0

        self.txt1.delete('1.0', END)
        self.txt1.insert('1.0', self.data.docs[self.curdoc].text)

    def refreshTextInfo(self):
        """Redisplay the current document, its attribute value and the
        processed text."""
        if self.selCategory.get() != 'Categories':
            self.entry1.delete(0, END)
            self.entry1.insert(0, self.data.getAttributeVal(self.curdoc, self.selCategory.get()))

        self.txt1.delete('1.0', END)
        self.txt1.insert('1.0', self.data.docs[self.curdoc].text)

        self.applyProcessing()

    def changeCategory(self, value):
        """Select category ``value``, show the current document's value for
        it and recompute the conditional frequencies."""
        self.selCategory.set(value)
        self.entry1.delete(0, END)
        self.entry1.insert(0, self.data.getAttributeVal(self.curdoc, self.selCategory.get()))
        self.processor.calculateConditionalFrequency(self.data, self.selCategory.get())

    def prevDocument(self):
        """Step back one document, if there is one."""
        if self.curdoc > 0:
            self.curdoc -= 1
            self.refreshTextInfo()

    def nextDocument(self):
        """Step forward one document, if a corpus is loaded and there is one."""
        if self.data is not None and self.curdoc < self.data.ndocs - 1:
            self.curdoc += 1
            self.refreshTextInfo()

    def popup(self, event):
        """Show the context menu at the pointer position (right-click
        handler of the processed-text pane)."""
        # Single-argument call form prints the same on Python 2 and 3.
        print("hello " + str(event.widget))
        self.popupmenu.tk_popup(event.x_root, event.y_root, 0)
        print(event.widget.index("@%s,%s" % (event.x, event.y)))

    def applyProcessing(self):
        """Fill the right pane with the processed text of the current
        document, or clear it while no real category is selected."""
        if self.selCategory.get() != 'Categories':
            indxCat = self.data.attributes.index(self.selCategory.get())
            textResult = self.processor.process(self.data.docs[self.curdoc], indxCat)
        else:
            textResult = ""
        self.txt2.delete('1.0', END)
        self.txt2.insert('1.0', textResult)

    def loadCorpus(self):
        """Ask the user for a corpus directory, load it and refresh the view."""
        path = tkFileDialog.askdirectory()
        if not path:
            # The dialog was cancelled; keep the current state.
            return
        self.readCorpus(path)
        self.refreshTextInfo()

    def hello(self):
        """Placeholder command used by the context-menu entries."""
        print("Hello")

    def initUI(self):
        """Create and lay out the toolbar, the two text panes, the options
        bar, the menu bar and the context menu."""
        self.parent.title("Simple")
        self.pack(fill=BOTH, expand=True)
        self.centerWindow()

        sw = self.parent.winfo_screenwidth()

        # --- toolbar: navigation, category selector, attribute value ---
        frame1 = Frame(self, relief=RAISED, borderwidth=1)
        frame1.pack(fill=X)

        button1 = Button(frame1, text=u"<", command=self.prevDocument)
        button1.pack(side=LEFT, padx=5, pady=5)

        button2 = Button(frame1, text=u">", command=self.nextDocument)
        button2.pack(side=LEFT, padx=5, pady=5)

        self.selCategory = StringVar(self)
        self.categoryOption = OptionMenu(frame1, self.selCategory, *["Categories"], command=self.changeCategory)
        self.categoryOption.pack(side=LEFT, padx=5, pady=5)

        self.entry1 = Entry(frame1)
        self.entry1.pack(side=LEFT, padx=5, pady=5)

        self.ignoreActualDocVar = IntVar(self)

        ignoreCheck = Checkbutton(frame1, text="Ignored", variable=self.ignoreActualDocVar)
        ignoreCheck.pack(side=LEFT, padx=5, pady=5)

        # NOTE(review): "Save document" is wired to prevDocument, which
        # looks like a copy-paste leftover; no save method exists in this
        # class, so the binding is left unchanged.
        saveButton = Button(frame1, text=u"Save document", command=self.prevDocument)
        saveButton.pack(side=LEFT, padx=5, pady=5)

        # --- document panes: raw text (left) and processed text (right) ---
        frame2 = PanedWindow(self, orient=HORIZONTAL)
        frame2.pack(fill=BOTH, expand=1)

        # Floor division keeps the width an integer on Python 3 as well.
        self.txt1 = Text(frame2, width=sw // 22)
        frame2.add(self.txt1)

        self.txt2 = Text(frame2)
        self.txt2.bind("<Button-3>", self.popup)
        frame2.add(self.txt2)

        # --- processing options ---
        frame3 = Frame(self, relief=RAISED, borderwidth=1)
        frame3.pack(fill=X)

        self.swVar = IntVar(self)
        stopWordsCheck = Checkbutton(frame3, text="Remove stop words", variable=self.swVar)
        stopWordsCheck.pack(side=LEFT, padx=5, pady=5)

        self.lowerVar = IntVar(self)
        lowerCheck = Checkbutton(frame3, text="Convert to lower case", variable=self.lowerVar)
        lowerCheck.pack(side=LEFT, padx=5, pady=5)

        applyButton = Button(frame3, text=u"Apply", command=self.applyProcessing)
        applyButton.pack(side=LEFT, padx=5, pady=5)

        # --- top-level menu ---
        menubar = Menu(self)

        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(label="Quit", command=self.parent.quit)
        filemenu.add_command(label="Open corpus", command=self.loadCorpus)
        menubar.add_cascade(label="Project", menu=filemenu)

        # display the menu
        self.parent.config(menu=menubar)

        # --- context menu shown by ``popup`` ---
        self.popupmenu = Menu(self.parent, tearoff=0)
        self.popupmenu.add_command(label="Undo", command=self.hello)
        self.popupmenu.add_command(label="Redo", command=self.hello)

    def centerWindow(self):
        """Resize the parent window to two thirds of the screen in each
        dimension and centre it."""
        sw = self.parent.winfo_screenwidth()
        sh = self.parent.winfo_screenheight()

        w = sw / 1.5
        h = sh / 1.5

        x = (sw - w) / 2
        y = (sh - h) / 2
        # ``%d`` truncates the float values to whole pixels.
        self.parent.geometry('%dx%d+%d+%d' % (w, h, x, y))
Ejemplo n.º 5
0
        "DATA_PATH": str,
        "session_num": 13,
    }

    if os.environ["HOME"] == "/root":
        args["DATA_PATH"] = "/content/gdrive/MyDrive/bert-for-hmltc/data"
    else:
        args["DATA_PATH"] = "data"

    random.seed(args["seed"])
    np.random.seed(args["seed"])
    torch.manual_seed(args["seed"])

    logger.info("Initializing…")
    tokenizer = load_tokenizer(args)
    processor = TextProcessor(args, tokenizer, logger, "topic_list.json")

    if args["use_parents"]:
        model = create_experimental(args, len(processor.labels))
    else:
        model = create_baseline(args, len(processor.labels))

    model_state_dict = torch.load(
        join(args["DATA_PATH"], "model_files/13_finetuned_pytorch_model.bin"),
        map_location="cpu",
    )
    model.load_state_dict(model_state_dict)
    if args["do_train"]:
        trainer = ModelTrainer(args, model, logger)

        logger.info("Loading data…")
Ejemplo n.º 6
0
from flask import Flask, render_template, request
from processor import TextProcessor, Document
import re
from bs4 import BeautifulSoup
import urllib3
import numpy as np
from flask.ext.pymongo import PyMongo
import os
import sys

# Flask application object; the MongoDB connection URI is read from the
# MONGOHQ_URL environment variable (``os.getenv`` yields None when unset).
app = Flask(__name__)
app.config['MONGO_URI'] = os.getenv('MONGOHQ_URL')
mongo = PyMongo(app)
# One module-level processor, created at import time and shared afterwards.
processor = TextProcessor()

# This block runs at import time, outside any request, so an application
# context is entered explicitly -- presumably required for ``mongo.db``
# access (TODO confirm against the Flask-PyMongo version in use).
with app.app_context():
    # Hand the result of querying the ``postings`` collection to the
    # processor, then build its document matrix.
    processor.map_data(mongo.db.postings.find())
    processor.build_doc_matrix()


@app.route('/', methods=['GET', 'POST'])
def main():
    """Handle the index page.

    A GET request renders ``index.html``.  For any other request the
    ``url`` form field is read and stripped of surrounding whitespace; when
    it does not contain the expected Craigslist domain, ``index.html`` is
    rendered again with an error message.
    """
    if request.method == 'GET':
        return render_template('index.html')

    url = request.form['url'].strip()
    domain = 'newyork.craigslist.org/'

    # Plain substring check: the domain may appear anywhere in the value.
    if domain not in url:
        return render_template('index.html', error='Please enter a valid URL')
Ejemplo n.º 7
0
 def setUp(self):
     """Create a new ``TextProcessor`` and store it on the instance."""
     self.processor = TextProcessor()