예제 #1
0
파일: arxiv.py 프로젝트: TychoA/Arxiv
class Arxiv:
    '''
    Compare a local document against articles fetched through ``API``.

    On construction the document is read from disk, a search phrase is
    derived from its text (``phrase_finder`` over ``tokenize``) and that
    phrase is handed to ``API``, whose ``get`` is called with ``amount``.
    '''

    # Values accepted by ``similars`` for its ``sort`` argument.
    _SORTS = ('full', 'summary', 'phrase')

    def __init__(self, filename, amount, language='english'):
        '''
        Read the document and fetch the data for its search phrase.

        -- Variables --
        @filename is the path of the document to read.
        @amount is passed unchanged to ``API.get``
            (presumably the number of articles to fetch -- confirm in API).
        @language is passed to ``tokenize``.

        raises
        IOError / OSError when the file cannot be opened or read.
        '''
        # latin-1 maps every byte to a character, so the decode step cannot
        # fail; only opening/reading can. ``__init__`` must return None, so
        # the I/O error is left to propagate instead of returning a message.
        with open(filename, 'rb') as handle:
            self.text = handle.read().decode('latin-1')

        # Set base variables
        self.language = language
        self.search_term = phrase_finder(tokenize(self.text, self.language),
                                         n=5,
                                         gram='bigram')

        # Initialize api class and get the data
        self.api = API(self.search_term)
        self.api.get(amount)

    def dump(self):
        '''
        Returns ``self.api.data.prettify()``.

        NOTE(review): presumably a prettified BeautifulSoup document of the
        fetched page -- confirm against API.
        '''
        return self.api.data.prettify()

    def summary(self):
        '''
        Returns an excerpt of the input document.

        The text is split on '.', the first five pieces are skipped and up
        to the next fifteen are joined with '.' again.

        NOTE(review): the five-piece offset is kept from the original code;
        presumably it skips front matter such as title and authors -- confirm.
        '''
        pieces = self.text.split('.')[5:20]
        return '.'.join(pieces)

    def similars(self, lvl, sort, *args):
        '''
        Function that loops through the collected articles and scores each
        one against the local document.

        The result is also kept on the instance (``self._similars``, with
        the matching column names in ``self.fields``) so ``to_csv`` can write
        it. It is deliberately not stored as ``self.similars``: that would
        replace this method on the instance and break any later call.

        -- Variables --
        @lvl is the similarity threshold; only articles scoring strictly
            above it are kept.
        @sort is the content the similarity is based on. String of
            'full'    - local text against the text parsed from the PDF
            'summary' - ``self.summary()`` against the article summary
            'phrase'  - search phrase against the article summary's phrase
        @args are additional fields to copy into each result
            (such as author, journal etc.).

        returns
        @dictionary of title -> info, where info holds 'similarity', 'url',
            'summary' and the requested extra fields.
        For sort 'full', a message string is returned instead as soon as
        parsing or scoring one PDF fails.

        raises
        ValueError when @sort is not one of the values above.
        '''
        if sort not in self._SORTS:
            raise ValueError("sort must be one of %s, got %r" %
                             (', '.join(self._SORTS), sort))

        # NOTE(review): assumes collect() returns a dict-like mapping of
        # title -> field dict -- confirm against API.
        collected = self.api.collect('url', 'summary', args)

        # Set for later use
        self.fields = list(
            self.api.flatten([args, 'similarity', 'summary', 'url']))
        self._similars = defaultdict(dict)

        # Loop-invariant values, computed once instead of per article.
        extras = list(self.api.flatten(args)) if args else []
        own_summary = self.summary() if sort == 'summary' else None

        # Loop through articles
        for title, entry in collected.items():
            info = {}

            # Specify content on which to calculate similarity
            if sort == 'full':
                url = entry['url']
                pdf = PDF(url)
                try:
                    info['similarity'] = cosine_sim(self.text, pdf.parse())
                except Exception:
                    # Any failure here aborts the whole run; results
                    # gathered so far stay in self._similars.
                    return 'PDFTextExtractionNotAllowed is raised for %s' % url

            elif sort == 'summary':
                info['similarity'] = cosine_sim(own_summary, entry['summary'])

            else:  # sort == 'phrase'
                external = phrase_finder(tokenize(entry['summary'],
                                                  self.language),
                                         n=5,
                                         gram='bigram')
                info['similarity'] = cosine_sim(self.search_term, external)

            # Add additional attributes
            for arg in extras:
                info[arg] = entry[arg]

            info['url'] = entry['url']
            info['summary'] = entry['summary']

            if info['similarity'] > lvl:
                self._similars[title] = info

        # Return collection
        return self._similars

    def to_csv(self):
        '''
        Writes data retrieved from similars to a csv file.

        The file is named after the search term with '.csv' appended. It
        gets a 'title' column followed by the fields recorded by
        ``similars``, and one row per stored article.

        returns
        A message string: either a hint to run ``similars`` first, or a
        confirmation containing the name of the written file.
        '''
        import sys

        # See if the data already exists
        try:
            data = self._similars
            # Build a new list: inserting into self.fields itself would add
            # another 'title' column on every call.
            fields = ['title'] + list(self.fields)
        except AttributeError:
            return 'Make sure to run similars first, to create the data.'

        path = self.search_term + '.csv'

        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            handle = open(path, 'w', newline='')
        else:
            handle = open(path, 'wb')

        with handle as c:
            writer = csv.DictWriter(c, fieldnames=fields, delimiter=',')
            writer.writeheader()

            # Write rows
            for key, val in data.items():
                row = {'title': key}
                row.update(val)
                writer.writerow(row)

        return 'A csv file has been created as %s.' % path