Example #1
0
	def insertResultsLinkDictIntoDB(self, resultsLinkDict, googleSearchQueryObj, printing, printingDebug):
		"""Insert every URL of resultsLinkDict into the table self.dbTableName.

		Does nothing unless self.conn is set and self.dbTableName is a string.
		Never raises for a failed insert: each failure is counted and, depending
		on the flags, reported on stdout.

		Args:
			resultsLinkDict: mapping of result-page number -> sequence of URLs
				found on that page, in on-page order.
			googleSearchQueryObj: query object; this method calls its
				getTopicStringForDB(), getDaterangeFrom(), getDaterangeTO()
				and toString() methods to fill the row.
			printing: when true, print progress, per-URL failures and the
				final summary.
			printingDebug: when true, print the input dict, forward the flag
				to sqliteDefaults.insert_table_sqlite and print tracebacks
				of caught exceptions.

		Returns:
			None.
		"""
		if self.conn is None or not isinstance(self.dbTableName, str):
			return

		if printing:
			print("\n\t\t\tTrying to insert results obtained...")
		try:
			resultCount = 0		## overall rank of the result; 1 means the top result
			repeatCount = 0
			uniqueResultNumber = 0
			## Pre-bound so the failure message below can always be formatted,
			## even when getTopicStringForDB() itself is what raised.
			topic = None

			if printingDebug:
				print("\n\n\nresultsLinkDict=\n%s\n\n"%(resultsLinkDict,))
				print("sorted(resultsLinkDict.keys()) = %s\n\n\n"%(sorted(resultsLinkDict.keys()),))

			for pageNum in sorted(resultsLinkDict):
				for resultNumberOnPage, resUrl in enumerate(resultsLinkDict[pageNum], 1):
					## The rank is incremented for every URL, duplicates included,
					## so it always reflects the position in the search results.
					resultCount += 1
					try:
						topic = googleSearchQueryObj.getTopicStringForDB()
						startDate = googleSearchQueryObj.getDaterangeFrom()
						endDate = googleSearchQueryObj.getDaterangeTO()

						sqliteDefaults.insert_table_sqlite(
							self.conn, self.dbTableName,
							('resultNumberInSearch', 'URL', 'Topic', 'ResultPageNumber', 'ResultNumberOnPage',
							 'StartDate', 'EndDate', 'SearchedOnDate', 'ObtainedFromQuery'),
							[(resultCount, resUrl, topic, pageNum, resultNumberOnPage,
							  startDate, endDate, datetime.datetime.now().date(), googleSearchQueryObj.toString())],
							printing_debug = printingDebug
						)

						uniqueResultNumber += 1

					except Exception as e:
						## Any failure that is not an SQL syntax error is counted as a repeated URL.
						if 'syntax error' not in str(e).lower():
							repeatCount += 1
						if printing:
							print("\n\t\t\t\tCould not insert topic-url pair ( %s  ,  %s )"%(topic, resUrl))
							print("\t\t\t\tError description: %s\n"%e)
						if printingDebug:
							## print_exc() shows where the exception was raised;
							## print_stack() would only show this handler's own call stack.
							traceback.print_exc()

			if printing:
				print("\n\t\t\t\tNumber of URLs repeated: (%s/%s)"%(repeatCount,resultCount))
				print("\t\t\t\tNumber of unique URLs inserted: (%s/%s)\n"%(uniqueResultNumber, resultCount))

		except Exception as e:
			if printing:
				print("\t\t\tERROR in GoogleSearch._insertResultsLinkDictIntoDB(): Cannot insert results extracted so far into database.\n")
				print("\n\t\t\tERROR description: %s"%e)
			if printingDebug:
				print("\n\t\t\tPrinting stack traceback:\n")
				traceback.print_exc()
				print("\n\n\n")
		traceback.print_exc()

		## Best-effort dump of the URLs collected so far into the database, then stop.
		## NOTE(review): conn, db_table_name, topic, start_date, end_date and
		## results_link_dict are not bound in this excerpt -- presumably set earlier
		## in the enclosing code; confirm.
		try:
			print "\n\n\n\tTrying to insert results obtained so far into...\n"

			repeat_count=0
			result_number=1		## gives priority of results. ResultNumber = 1 means the top result
			unique_result_number=1	## starts at 1, hence the "-1" when the total is printed below
			for page_num in results_link_dict:
				for res_url in results_link_dict[page_num]: 
						## Insert the url into the database.

						try :
							sqliteDefaults.insert_table_sqlite(conn, 
								db_table_name, 
								('URL',	 	'Topic',	'StartDate',	'EndDate', 	'ResultPageNumber', 'ResultNumber'), 
								[(res_url,	 topic, 		start_date, 	end_date, 	page_num, result_number)]
							)
							unique_result_number+=1
						except Exception:
							## Every failed insert is reported and counted as a repeat,
							## whatever the actual cause of the exception was.
							print "\t\tCould not insert topic-url pair ( %s  ,  %s ), possible duplicate"%(topic, res_url)
							repeat_count=repeat_count+1

						result_number=result_number+1	## We always increment the result number to show where it would be on the page, even in case of duplicates
			print "\n\n\tNumber of URLs repeated in this search = %s"%(repeat_count)
			print "\tNumber of URLs extracted in this search = %s"%(unique_result_number-1)

		except Exception:
			## Anything raised outside the per-URL try (e.g. while iterating the dict) lands here.
			print "\n\t\tERROR: Cannot insert results extracted so far into database.\n"

		## Ends the program whether or not the inserts succeeded.
		exit()
Example #3
0
        traceback.print_exc()

        try:
            print "\n\n\n\tTrying to insert results obtained so far into...\n"

            repeat_count = 0
            result_number = 1  ## gives priority of results. ResultNumber = 1 means the top result
            unique_result_number = 1
            for page_num in results_link_dict:
                for res_url in results_link_dict[page_num]:
                    ## Insert the url into the database.

                    try:
                        sqliteDefaults.insert_table_sqlite(
                            conn, db_table_name,
                            ('URL', 'Topic', 'StartDate', 'EndDate',
                             'ResultPageNumber', 'ResultNumber'),
                            [(res_url, topic, start_date, end_date, page_num,
                              result_number)])
                        unique_result_number += 1
                    except Exception:
                        print "\t\tCould not insert topic-url pair ( %s  ,  %s ), possible duplicate" % (
                            topic, res_url)
                        repeat_count = repeat_count + 1

                    result_number = result_number + 1  ## We always increment te result number to show where it would be on the page, even in case of duplicates
            print "\n\n\tNumber of URLs repeated in this search = %s" % (
                repeat_count)
            print "\tNumber of URLs extracted in this search = %s" % (
                unique_result_number - 1)

        except Exception:
	temp_list = list(sorted([j for j in company_dict]))
	for j in temp_list:
		if j >= 100:
			print "%s : %s"%(j, company_dict[j])
	


	while True:
		select = raw_input("\n>")
		if not select:
			ensure_not_double_query = "SELECT * from articles_clean where article_url='%s'"%(articles[i][0])
			if len(sqliteDefaults.verified_select_sqlite(conn,ensure_not_double_query)) == 0:

				sqliteDefaults.insert_table_sqlite(conn, 
					'articles_clean',
					('company_or_sector', 'article_url'),
					[ (articles[i][1], articles[i][0]) ]
				)
				
				i+=1		## Go to next entry
			else:
				print "ERROR #1: pair already exists in table"
				y = raw_input()
				
			break;
	




		elif select.lower() == 'p' or select.lower() == 'b':