Example #1
import string

from dandelion import DataTXT
from eventregistry import EventRegistry, QueryArticlesIter, QueryItems
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


def tok1(msg):
    lis = []
    li = []
    datatxt = DataTXT(app_id='5d504312af124377bac2f69c908dc20b',
                      app_key='5d504312af124377bac2f69c908dc20b')
    # Whitelist of reputable news sources.
    repnews = [
        'news.google.co.in', 'nytimes.com', 'timesofindia.indiatimes.com',
        'wsj.com', 'washingtonpost.com', 'bbc.com', 'moneycontrol.com',
        'economist.com', 'newyorker.com', 'economictimes.indiatimes.com',
        'ndtv.com', 'indiatoday.in', 'indianexpress.com', 'thehindu.com',
        'news18.com', 'firstpost.com', 'dnaindia.com', 'apnews.com',
        'brief.news', 'npr.org', 'scroll.in', 'reuters.com'
    ]
    # Tokenize the message and drop English stopwords and punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(msg)
    stop = stopwords.words('english') + list(string.punctuation)
    tokens = [i for i in tokens if i not in stop]
    # Query Event Registry for articles whose titles match any keyword.
    er = EventRegistry(apiKey="e010e4f7-343c-49d5-893d-63d4c2cfd487")
    q = QueryArticlesIter(keywords=QueryItems.OR(tokens),
                          lang=["eng"],
                          keywordsLoc="title")
    articles = q.execQuery(er, sortBy="rel", maxItems=1)
    # Collect titles published by whitelisted sources, avoiding duplicates.
    for article in articles:
        if article['source']['uri'] in repnews:
            if article['title'] not in lis:
                lis.append(article['title'])
    # Keep only titles that Dandelion rates at least 60% similar to msg.
    for title in lis:
        result = datatxt.sim(msg, title)
        if result['similarity'] >= 0.60:
            print(result['similarity'])
            li.append(title)
    return li
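
A minimal usage sketch (the message is a hypothetical input; NLTK's stopword corpus must be downloaded first with nltk.download('stopwords')):

# Hypothetical message; tok1 returns the whitelisted-source headlines
# that Dandelion rates at least 60% similar to it.
matching = tok1("NASA announces a new mission to study the Moon")
for title in matching:
    print(title)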
Example #2
import os
from unittest import TestCase

from dandelion import DataTXT, default_config
from dandelion.base import DandelionException


class TestDatatxt(TestCase):
    def setUp(self):
        # Credentials are read from the environment so they stay out of code.
        default_config['app_id'] = os.environ['APP_ID']
        default_config['app_key'] = os.environ['APP_KEY']
        self.datatxt = DataTXT()

    def test_nex(self):
        res = self.datatxt.nex('They say Apple is better than Windows')
        self.assertEqual(
            {annotation.uri for annotation in res.annotations},
            {'http://en.wikipedia.org/wiki/Apple_Inc.',
             'http://en.wikipedia.org/wiki/Microsoft_Windows'}
        )

    def test_sim(self):
        res = self.datatxt.sim(
            'Reports that the NSA eavesdropped on world leaders have "severely'
            ' shaken" relations between Europe and the U.S., German Chancellor'
            ' Angela Merkel said.',
            # --
            'Germany and France are to seek talks with the US to settle a row '
            'over spying, as espionage claims continue to overshadow an EU '
            'summit in Brussels.'
        )

        self.assertGreater(res.similarity, 0.5)

    def test_li(self):
        res = self.datatxt.li("Le nostre tre M sono: mafia, mamma, mandolino")

        self.assertEqual(
            [entry.lang for entry in res.detectedLangs],
            ['it']
        )

        self.assertGreater(res.detectedLangs[0].confidence, 0.9999)

    def test_raises_on_error(self):
        with self.assertRaises(DandelionException):
            self.datatxt.nex(text=None)

    def test_can_set_host(self):
        self.datatxt = DataTXT(host="api.dandelion.eu")
        self.test_nex()

        self.datatxt = DataTXT(host="http://api.dandelion.eu")
        self.test_nex()
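
Since setUp pulls the credentials from the environment, a minimal runner sketch looks like this (the module name test_datatxt is hypothetical):

# Export the Dandelion credentials before running, e.g.:
#   APP_ID=your-id APP_KEY=your-key python -m unittest test_datatxt
import unittest

if __name__ == '__main__':
    unittest.main()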
Example #3

import convertapi
import pandas as pd
import textrazor

from dandelion import DataTXT

# Assumed project-local imports (exact module paths depend on the project):
# the Applicant Django model, download_file_from_google_drive,
# get_user_info_quora and get_user_info_git.


def final_score(event, keywords):
    textrazor.api_key = "9dcd16199684c470157ce02dc8ced9357b28f61dd685df6acc8dfd62"
    infocsv = pd.read_csv(event.csv_file.path, header=None)
    print("INFOOOO")
    print(infocsv.shape)
    print(infocsv.iloc[2, 2])
    dandelionclient = DataTXT(app_id='9355e03c7d5e4b879e6af9d8575159d2',
                              app_key='9355e03c7d5e4b879e6af9d8575159d2')
    # keywords = "reactjs, react.js, redux, React.js"

    a = []
    output = []

    for count in range(infocsv.shape[0]):
        # One CSV row per applicant: name, college, email, GitHub URL,
        # Quora URL, resume link, phone number.
        applicant = Applicant()
        applicant.name = str(infocsv.iloc[count, 0])
        applicant.college = str(infocsv.iloc[count, 1])
        applicant.email = str(infocsv.iloc[count, 2])
        applicant.github_url = str(infocsv.iloc[count, 3])
        if applicant.github_url == "nan":
            # A missing GitHub URL is treated as the end of the data rows.
            applicant.delete()
            break
        applicant.quora_url = infocsv.iloc[count, 4]
        applicant.resume_link = str(infocsv.iloc[count, 5])
        applicant.number = infocsv.iloc[count, 6]
        applicant.event = event
        applicant.save()
        print("resume_link")
        print(applicant.resume_link)

        print("RESUME INFO")
        # The resume link is a Google Drive URL; its second-to-last path
        # segment is the file id.
        words = applicant.resume_link.split('/')
        file_id = words[len(words) - 2]
        print("File ID", file_id)
        destination = './' + file_id + '.pdf'
        print("Destination:", destination)
        download_file_from_google_drive(file_id, destination)

        # Convert the downloaded PDF resume to plain text.
        convertapi.api_secret = 'Zgeg7qFLxqDtCAJr'
        result = convertapi.convert('txt', {'File': './' + file_id + '.pdf'})
        result.file.save('./')

        with open('./' + file_id + '.txt', "r", encoding="utf8") as f1:
            resumeinfo = f1.read()
        print(resumeinfo)
        print("=" * 100)
        try:
            # Extract high-confidence topics from the resume, then score
            # them against the target keywords (resume is worth 25 points).
            client = textrazor.TextRazor(extractors=["entities", "topics"])
            response = client.analyze(resumeinfo)
            related_keyword_resume = []
            for topic in response.topics():
                if topic.score > 0.7:
                    related_keyword_resume.append(topic.label)
            rel_key_resume = ', '.join(related_keyword_resume)
            print(rel_key_resume)
            r = dandelionclient.sim(rel_key_resume, keywords,
                                    lang="en", bow="one_empty")
            resumesimilarity = r.similarity * 25
        except Exception:
            resumesimilarity = 0
        print("--" * 100)

        print("QUORA INFO")
        quorainfo = get_user_info_quora(applicant.quora_url)
        print(quorainfo)
        print("=" * 100)
        if quorainfo != "":
            try:
                # Same topic-extraction pipeline; Quora is worth 15 points.
                client = textrazor.TextRazor(extractors=["topics"])
                response = client.analyze(quorainfo)
                related_keyword_qra = []
                for topic in response.topics():
                    if topic.score > 0.7:
                        related_keyword_qra.append(topic.label)
                rel_key_quora = ', '.join(related_keyword_qra)
                print(rel_key_quora)
                r = dandelionclient.sim(rel_key_quora, keywords,
                                        lang="en", bow="one_empty")
                quorasimilarity = r.similarity * 15
            except Exception as e:
                print(e)
                quorasimilarity = 0
        else:
            quorasimilarity = 0
        print("--" * 100)

        print("GITHUB INFO")
        gitinfo = get_user_info_git(applicant.github_url)[0]
        print(gitinfo)
        print("==" * 100)
        try:
            # GitHub carries the most weight: 60 points.
            client = textrazor.TextRazor(extractors=["topics"])
            response = client.analyze(gitinfo)
            related_keyword_git = []
            for topic in response.topics():
                if topic.score > 0.7:
                    related_keyword_git.append(topic.label)
            rel_key_git = ', '.join(related_keyword_git)
            print(rel_key_git)
            print("--" * 100)
            r = dandelionclient.sim(rel_key_git, keywords,
                                    lang="en", bow="one_empty")
            gitsimilarity = r.similarity * 60
        except Exception:
            gitsimilarity = 0
        print("+" * 100)
        print(quorasimilarity, resumesimilarity, gitsimilarity)
        a.append(quorasimilarity + resumesimilarity + gitsimilarity)
        applicant.score = a[-1]
        applicant.save()
        output.append(applicant)

    output.sort(key=lambda x: x.score, reverse=True)
    print(a)
    return output
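
Each datatxt.sim similarity lies in [0, 1], and final_score weights the three sources 25/15/60 (resume/Quora/GitHub), so every applicant ends up with a score in [0, 100]. A standalone sketch of the aggregation with made-up similarity values:

# Hypothetical similarities, as datatxt.sim would return them.
resume_sim, quora_sim, git_sim = 0.8, 0.5, 0.9

# Weights from final_score(): resume 25, Quora 15, GitHub 60 (sum: 100).
score = resume_sim * 25 + quora_sim * 15 + git_sim * 60
print(score)  # 81.5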
Example #4
    # Fragment: body of a tweet-similarity routine. It assumes that
    # inputFile (a CSV path without extension), query1 (the reference text),
    # datatxt (a DataTXT client) and the pd/dandelion imports are provided
    # by the enclosing scope.
    inputFile = inputFile + ".csv"
    # Expected columns of the tweet CSV.
    colnames = [
        "username", "date", "retweets", "favorites", "text", "geo", "mentions",
        "hashtags", "id", "permalink"
    ]
    with open(inputFile) as csvfile:
        inputReader = pd.read_csv(csvfile, sep="|", error_bad_lines=False)
        textReader = inputReader['text']
        outputFile = inputFile.split('.')[0] + "_results.csv"
        with open(outputFile, "w") as f:
            # Write one pipe-separated row per tweet whose similarity to
            # query1 exceeds 0.4: index|text|tweet id|similarity.
            try:
                for i in range(0, inputReader.shape[0]):
                    query2 = textReader[i]
                    print(query2)
                    try:
                        response = datatxt.sim(query1, query2)
                        if response.similarity > 0.4:
                            f.write(str(i))
                            f.write('|')
                            f.write(query2)
                            f.write('|')
                            f.write(str(inputReader['id'][i]))
                            f.write('|')
                            f.write(str(response.similarity))
                            f.write('\n')
                    except Exception as e:
                        print(str(e))
            except dandelion.base.DandelionException:
                print("No Units Left")
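
For reference, the datatxt.sim call this fragment depends on can be exercised on its own; a sketch with placeholder credentials and texts:

from dandelion import DataTXT

# Placeholder credentials; real values come from dandelion.eu.
datatxt = DataTXT(app_id='YOUR_APP_ID', app_key='YOUR_APP_KEY')

response = datatxt.sim("text of the reference tweet", "text of another tweet")
if response.similarity > 0.4:  # the same threshold the fragment uses
    print(response.similarity)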
Example #5
import lxml.html
from bs4 import BeautifulSoup
from dandelion import DataTXT
from lxml.html.clean import Cleaner

# `url` is assumed to be defined earlier; a default lxml Cleaner is used.
cleaner = Cleaner()

# Clean scripts and other unsafe markup out of the page, then pull the text.
clean = cleaner.clean_html(lxml.html.parse(url))
clean = lxml.html.tostring(clean)

soup = BeautifulSoup(clean, 'lxml')
text = soup.get_text()

datatxt = DataTXT(app_id='d40305b7',
                  app_key='7d432531dfb0d3173212d4203f25d4b6')

#response = datatxt.sim(text, "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of ")

paragraphs = list()
match = list()

# Keep only substantive lines (longer than 20 characters).
for line in text.splitlines():
    if len(line) > 20:
        paragraphs.append(line)

# Score the first five paragraphs against the reference product blurb.
paragraphs = paragraphs[0:5]
for p in paragraphs:
    response = datatxt.sim(
        p,
        "The ultimate skel-ebration of monster mania, this year's Monster High dance will be the monster bash to end all bashes (if it happens)! And as the Monster High ghouls make new beast friends, the horror show really begins. This freaky fabulous new character is larger than unlife at 17 inches tall! And of course, she wears an over-the-tent fashion with lots of "
    )
    match.append(response.similarity)

# Highest similarity first.
match.sort(reverse=True)
print(match)
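
The same line-filtering and scoring pattern reappears in Example #6 as find_match; pulled out as a helper, it would look roughly like this (the function name is illustrative):

def best_similarity(datatxt, source_text, reference, max_paragraphs=5):
    # Keep lines longer than 20 characters, score the first few against
    # the reference text, and return the best similarity found.
    paragraphs = [l for l in source_text.splitlines() if len(l) > 20]
    scores = [datatxt.sim(p, reference).similarity
              for p in paragraphs[:max_paragraphs]]
    return max(scores) if scores else 0.0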
Example #6
from amazonproduct import API
# The error class location varies across python-amazon-product-api versions.
from amazonproduct.errors import NoExactMatchesFound
from dandelion import DataTXT
from dandelion.base import DandelionException
# Settings access as in older Scrapy versions, matching the dict-style use.
from scrapy.conf import settings


class AmazonPipeline(object):
    def __init__(self):
        self.api = API(locale='us')
        self.datatxt = DataTXT(app_id=settings['DANDELION_APP_ID'],
                               app_key=settings['DANDELION_KEY'])

    def process_item(self, item, spider):
        if spider.name in ['ebay_spider', 'amazon_spider']:
            return item
        item['asin'] = []

        if 'upc' in item:
            if item['upc']:
                asin = self.get_upc(item['upc'])
                item['asin'] = asin

        elif 'ean' in item:
            if item['ean']:
                asin = self.get_ean(item['ean'])
                item['asin'] = asin
        elif 'mpn' in item and 'brand' in item:
            if item['mpn'] and item['brand']:
                asin = self.search("%s+%s" % (item['mpn'], item['brand']),
                                   item['description'])
                item['asin'] = asin

        return item

    def get_upc(self, upc):
        # Look up the Amazon catalogue by UPC and collect matching ASINs.
        response = self.api.item_lookup(upc,
                                        SearchIndex="Blended",
                                        IdType="UPC")

        asin = list()

        for amazon_item in response.Items.Item:
            asin.append(amazon_item.ASIN.text)

        return asin

    def get_ean(self, ean):
        # Same lookup, keyed by EAN instead of UPC.
        response = self.api.item_lookup(ean,
                                        SearchIndex="Blended",
                                        IdType="EAN")

        asin = list()

        for amazon_item in response.Items.Item:
            asin.append(amazon_item.ASIN.text)

        return asin

    def search(self, keyword, description):
        asin = list()
        try:
            response = self.api.item_search("Blended",
                                            Keywords=keyword,
                                            ResponseGroup="EditorialReview")
        except NoExactMatchesFound:
            return asin

        # Match each result's editorial review against the description.
        for amazon_item in response:
            if hasattr(amazon_item, "EditorialReviews") and hasattr(
                    amazon_item.EditorialReviews, "EditorialReview"):
                match = self.find_match(
                    description,
                    amazon_item.EditorialReviews.EditorialReview.Content.text)
                # Dandelion similarities fall in [0, 1]; require a strong
                # 0.70 match before accepting the ASIN.
                if float(match) > 0.70:
                    asin.append(amazon_item.ASIN.text)
        return asin

    def find_match(self, source, dest):
        paragraphs = list()
        match = list()

        # Compare only substantive lines (longer than 20 characters).
        for line in source.splitlines():
            if len(line) > 20:
                paragraphs.append(line)

        paragraphs = paragraphs[0:5]
        try:
            for p in paragraphs:
                response = self.datatxt.sim(p, dest)
                match.append(response.similarity)
        except DandelionException:
            return 0.00

        # Return the best paragraph-level similarity, or 0.0 if there was
        # nothing to compare.
        match.sort(reverse=True)
        return match[0] if match else 0.00
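
To activate the pipeline, it has to be registered in the Scrapy project settings along with the Dandelion credentials it reads; a sketch (the module path myproject.pipelines is hypothetical):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.AmazonPipeline': 300,
}
DANDELION_APP_ID = 'your-app-id'
DANDELION_KEY = 'your-app-key'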