Example #1
    def process_feed(self):

        for f in self._feed_list:
            self._link = f
            self._fburned = feedparser.parse(self._link)

            # grab the details from the burned feed
            self._furl = self._fburned['url']
            self._fversion = self._fburned['version']
            self._flang = ""

            self._log.info(
                "Processing articles for %s: %s, %s" %
                (self._furl, str(self._fversion), self._flang.strip()))

            self._articles = []

            for i in self._fburned['items']:
                self._aframe = {
                    'title': None,
                    'date': None,
                    'link': None,
                    'keywords': [],
                    'feed': None,
                    'language': None
                }

                try:
                    self._ititle = i['title']
                    #self._isummary = i['summary']
                    self._idate = i['published']
                    self._ilink = i['link']
                    self._ctext = stripper(
                        i['summary']).get_data()  #strip out HTML
                    self._sum = summarize(
                        self._ctext, self._kword_amt)  #summarize the article
                    self._kwords = self._sum.get_most_used_words()
                except Exception as e:
                    print(str(e))
                    self._log.error("Failed to process %s" % i['title'])

                self._log.debug(
                    "Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s"
                    % (self._ititle, self._idate, self._ilink, self._kwords))

                self._aframe['title'] = self._ititle
                self._aframe['date'] = self._idate
                self._aframe['link'] = self._ilink
                self._aframe['keywords'] = self._kwords
                self._aframe['feed'] = self._furl
                self._aframe['language'] = self._flang
                hashed = hashlib.sha256(
                    str(self._aframe).encode("utf-8")
                ).hexdigest()  # hash the contents to check in DB
                self._aframe['hashed'] = hashed
                self._articles.append(self._aframe)

                if self._mongodb_handle._not_processed(hashed):
                    self._log.info("Adding %s (%s)" % (self._ititle, hashed))
                    self._mongodb_handle._insert_full(self._aframe)
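
The snippet above leans on two helpers that are not shown: stripper, which removes HTML from the entry summary and exposes the plain text via get_data(), and summarize, which picks the most-used words as keywords. A minimal sketch of what stripper could look like, assuming it is the usual HTMLParser-based tag stripper (the class name and get_data() come from the snippet; everything else here is an assumption, not the original implementation):

from html.parser import HTMLParser

class stripper(HTMLParser):
    # Hypothetical helper: collects only the text nodes of the markup.
    def __init__(self, html):
        super().__init__()
        self._chunks = []
        self.feed(html)  # parse right away so get_data() is immediately usable

    def handle_data(self, data):
        self._chunks.append(data)

    def get_data(self):
        return "".join(self._chunks)

# usage: plain_text = stripper("<p>Hello <b>world</b></p>").get_data()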
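
The example also depends on a MongoDB handle whose _not_processed and _insert_full methods deduplicate articles by their content hash before inserting them. A minimal pymongo sketch of such a handle, assuming a "hashed" field on the stored documents (the connection details, database and collection names are placeholders, not from the original project):

from pymongo import MongoClient

class MongoHandle:
    # Hypothetical stand-in for self._mongodb_handle in the example above.
    def __init__(self, uri="mongodb://localhost:27017", db="feeds", coll="articles"):
        self._coll = MongoClient(uri)[db][coll]

    def _not_processed(self, hashed):
        # True if no stored article carries this content hash yet.
        return self._coll.find_one({"hashed": hashed}) is None

    def _insert_full(self, aframe):
        self._coll.insert_one(aframe)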
Example #2
	def process_feed(self):

		for f in self._feed_list:
			self._frame = {'feed':None,'version':None,'language':None,'articles':[]}
			self._link,self._lang = f.split(",")
			self._lang = self._lang.strip()
			self._fburned = feedparser.parse(self._link)

			# grab the details from the burned feed
			self._furl = self._fburned['url']
			self._fversion = self._fburned['version']
			self._flang = self._lang

			self._log.debug("Processing articles for %s: %s, %s" % (self._furl,str(self._fversion),self._flang.strip()) )

			self._articles = []

			for i in self._fburned['items']:
				self._aframe = {'title':None,'date':None,'link':None,'keywords':[]}
				
				try:
					self._ititle = i['title']
					#self._isummary = i['summary']
					self._idate = i['published']
					self._ilink = i['link']								
					self._ctext = stripper(i['summary']).get_data() #strip out HTML
					self._sum = summarize(self._ctext,self._kword_amt) #summarize the article
					self._kwords = self._sum.get_most_used_words()
				except Exception as e:
					self._log.error("Failed to process %s" % i['title'])	
			
				self._log.debug("Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s" % (self._ititle,self._idate,self._ilink,self._kwords))
				
				self._aframe['title'] = self._ititle
				self._aframe['date'] = self._idate
				self._aframe['link'] = self._ilink
				self._aframe['keywords'] = self._kwords
				self._articles.append(self._aframe)

			self._frame['feed'] = self._furl
			self._frame['version'] = self._fversion
			self._frame['language'] = self._flang
			self._frame['articles'] = self._articles
Example #3
def main2(debug=False, sent_limit=3, lambda_=0.7):
    docs = load_data("../data/database.txt")
    corpus = make_corpus(docs)
    tfidf = TfidfModel()
    model, dictionary = tfidf.generate(corpus)
    dictionary.save_as_text("../data/dict.txt")
    model.save("../data/model.model")

    """
    dictionary = gensim.corpora.Dictionary.load_from_text("../data/dict.txt")
    model = gensim.models.TfidfModel.load("../data/model.model")
    """

    target = read_file("../data/report.txt")
    target_sent, target_corpus = preprocess_target(target)

    indexes = summarize(target_corpus, model, dictionary, sent_limit=sent_limit, lambda_=lambda_)

    for index in sorted(indexes):
        print(target_sent[index])
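
The commented-out block in main2 shows how the saved dictionary and TF-IDF model can be reloaded. As an illustration of how the reloaded artifacts could be used to weigh a single sentence, here is a minimal sketch; the scoring function is an assumption for illustration, not the project's summarize:

import gensim

dictionary = gensim.corpora.Dictionary.load_from_text("../data/dict.txt")
model = gensim.models.TfidfModel.load("../data/model.model")

def sentence_weight(sent_tokens):
    # sent_tokens is a hypothetical list of tokens for one sentence
    bow = dictionary.doc2bow(sent_tokens)            # (token_id, count) pairs
    return sum(weight for _, weight in model[bow])   # total TF-IDF mass of the sentence
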
def main():
    cubeIndex = []
    pyramidIndex = []
    ellipsoidIndex = []
    done = False

    testCase = int(input("What is the test case number?: "))

    # This loop lets the user keep entering values until they type "q" or "quit".
    while not done:
        user = input("Enter Cube/c, Pyramid/p, Ellipsoid/e, Quit/q: ")

        # This ensures that once the user types "quit" or "q", no more inputs can be accepted.
        if user.lower() == "q" or user.lower() == "quit":
            done = True

        # This ensures that once the user types "Cube" or "c", the cube volume obtained from the function in the volume.py will have its value added to the list
        elif user.lower() == "cube" or user.lower() == "c":
            cubeIndex.append(cubeVol())

        elif user.lower() == "pyramid" or user.lower() == "p":
            pyramidIndex.append(pyramidVol())

        elif user.lower() == "ellipsoid" or user.lower() == "e":
            ellipsoidIndex.append(ellipsoidVol())

        # If the user doesn't type in any of the shapes or quit, it will display this message
        else:
            print("Invalid input")

    # If no shapes at all were entered, display the following message

    if len(cubeIndex) == 0 and len(pyramidIndex) == 0 and len(
            ellipsoidIndex) == 0:
        print(
            "You have reached the end of your session. You did not perform any volume calculations. "
        )

    else:
        cubeOutput = ""
        pyramidOutput = ""
        ellipsoidOutput = ""

    # This ensures that the data will be sorted in increasing order

    cubeIndex = sorted(cubeIndex)
    pyramidIndex = sorted(pyramidIndex)
    ellipsoidIndex = sorted(ellipsoidIndex)

    # If the user entered at least one of a given shape, build a comma-separated list of its volumes

    if len(cubeIndex) > 0:
        for x in range(len(cubeIndex)):
            if x != (len(cubeIndex) - 1):
                cubeOutput += (" " + str(cubeIndex[x]) + ",")
            else:
                cubeOutput += (" " + str(cubeIndex[x]))

    # Same process as above

    if len(pyramidIndex) > 0:
        for x in range(len(pyramidIndex)):
            if x != (len(pyramidIndex) - 1):
                pyramidOutput += (" " + str(pyramidIndex[x]) + ",")
            else:
                pyramidOutput += (" " + str(pyramidIndex[x]))

    if len(ellipsoidIndex) > 0:
        for x in range(len(ellipsoidIndex)):
            if x != (len(ellipsoidIndex) - 1):
                ellipsoidOutput += (" " + str(ellipsoidIndex[x]) + ",")
            else:
                ellipsoidOutput += (" " + str(ellipsoidIndex[x]))

    # This will display a message if no shapes are present in the assigned list for a particular shape
    if len(cubeIndex) == 0:
        cubeOutput = "No Shapes Entered."

    if len(pyramidIndex) == 0:
        pyramidOutput = "No Shapes Entered."

    if len(ellipsoidIndex) == 0:
        ellipsoidOutput = "No Shapes Entered."

    # Print a summary of the volumes calculated for the shapes entered
    print(
        "You have reached the end of your session. The volumes calculated for each shape are: "
    )
    print("Cube:", cubeOutput)
    print("Pyramid:", pyramidOutput)
    print("Ellipsoid:", ellipsoidOutput)

    summarize(cubeIndex, pyramidIndex, ellipsoidIndex, testCase)

    return
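
main() relies on cubeVol, pyramidVol and ellipsoidVol from volume.py, which are not shown. A minimal sketch under the assumption that each one prompts for its dimensions and returns the computed volume (the prompts and parameter choices are guesses; only the formulas are standard):

import math

def cubeVol():
    s = float(input("Side length: "))
    return s ** 3                            # V = s^3

def pyramidVol():
    b = float(input("Base area: "))
    h = float(input("Height: "))
    return b * h / 3                         # V = (1/3) * B * h

def ellipsoidVol():
    a = float(input("Semi-axis a: "))
    b = float(input("Semi-axis b: "))
    c = float(input("Semi-axis c: "))
    return 4.0 / 3.0 * math.pi * a * b * c   # V = (4/3) * pi * a * b * c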