Ejemplo n.º 1
0
    def setUpClass(cls):
        """Create the Pipeline shared by the tests and cache its term scores.

        The pipeline reads data/USPTO-random-100.csv, takes text from the
        'abstract' column, uses n-grams of length 2 to 3 with max_df=0.3, and
        is built with calculate_timeseries=True. Both date ranges in the mask
        run from 1900-01-01 to today, each bound converted with
        date_to_year_week.
        """
        # Upper bound first, then lower bound, mirroring the original order.
        newest = date_to_year_week(pd.to_datetime('today').date())
        oldest = date_to_year_week(pd.to_datetime('1900-01-01').date())

        mask = {
            'filter_by': 'union',
            'cpc': None,
            'time': None,
            'cite': [],
            'columns': None,
            # Two separate dict objects with equal contents, never aliased.
            'date': {'to': newest, 'from': oldest},
            'timeseries_date': {'to': newest, 'from': oldest},
            'date_header': 'publication_date',
        }

        pipeline = Pipeline(os.path.join('data', 'USPTO-random-100.csv'),
                            mask,
                            ngram_range=(2, 3),
                            text_header='abstract',
                            max_df=0.3,
                            output_name='test',
                            calculate_timeseries=True)

        cls.__pipeline = pipeline
        cls.__term_score_tuples = pipeline.term_score_tuples
Ejemplo n.º 2
0
    def test_filter_cpc_A61_intersection_dates(self):
        """Check doc indices for CPC 'A61' intersected with a date window.

        The window runs from 2010/06/01 to today; the filter mode is
        'intersection'.
        """
        window_start = date_to_year_week(pd.Timestamp('2010/06/01'))
        window_end = date_to_year_week(pd.to_datetime('today'))

        # Same dict object as the fixture attribute; it is mutated in place.
        mask = self.__docs_mask_dict
        mask['date'] = {'from': window_start, 'to': window_end}
        mask['filter_by'] = 'intersection'
        mask['cpc'] = 'A61'

        doc_filter = DocumentsFilter(self.__dates, mask, self.__cpc_dict, 100)

        # Order matters: assertListEqual compares element by element.
        expected = [67, 69, 72, 74, 43, 81, 50, 85, 57, 90, 60, 94, 63]
        self.assertListEqual(list(doc_filter.doc_indices), expected)
Ejemplo n.º 3
0
    def test_filter_dates(self):
        """Check doc indices when only a date window (2010/06/01 to today) is set."""
        since = date_to_year_week(pd.Timestamp('2010/06/01'))
        until = date_to_year_week(pd.to_datetime('today'))
        self.__docs_mask_dict['date'] = {'from': since, 'to': until}

        selected = DocumentsFilter(self.__dates, self.__docs_mask_dict,
                                   self.__cpc_dict, 100).doc_indices

        # The expected indices form the contiguous run 26, 27, ..., 99.
        self.assertListEqual(list(selected), list(range(26, 100)))
Ejemplo n.º 4
0
    def get_docs_mask_dict(self):
        """Assemble the document-mask dictionary from ``self.args``.

        Returns:
            A dict with the keys 'filter_by', 'cpc', 'cite', 'columns', 'date'
            and 'date_header'. 'cite' is always None. 'date' stays None unless
            ``args.date_to`` or ``args.date_from`` is set; then it is a dict
            holding the 'to' and 'from' bounds, each passed through
            date_to_year_week.
        """
        args = self.args
        mask = {
            'filter_by': args.filter_by,
            'cpc': args.cpc_classification,
            'cite': None,
            'columns': args.filter_columns,
            'date': None,
            'date_header': args.date_header,
        }

        # Neither bound supplied: leave the date entry as None.
        if args.date_to is None and args.date_from is None:
            return mask

        # A missing upper bound falls back to today's date (note the .date()
        # call); an explicit one is passed on as returned by pd.to_datetime.
        if args.date_to is None:
            upper = pd.to_datetime('today').date()
        else:
            upper = pd.to_datetime(args.date_to)

        # A missing lower bound falls back to 1900-01-01.
        if args.date_from is None:
            lower = pd.to_datetime('1900-01-01')
        else:
            lower = pd.to_datetime(args.date_from)

        mask['date'] = {
            'to': date_to_year_week(upper),
            'from': date_to_year_week(lower),
        }
        return mask