コード例 #1
0
    def from_csv(self) -> list:
        """Load every CSV dump of this table from the ``databases`` directory.

        Files are looked up as ``<root_path>/databases/<table>_*.csv``. The
        first row of each file is taken as the column names and every
        following row becomes one dict mapping column name to the raw
        string value.

        Parsing is delegated to the standard ``csv`` module, so quoted
        fields may contain commas. Files without a header row (including
        empty files) and blank lines are skipped. Values beyond the header
        width are ignored; a row shorter than the header yields a dict
        holding only the columns that are present.

        :return: Row dicts from all matching files, or an empty list when
            the directory does not exist or no file matches.
        """
        # Imported locally so the method does not rely on the module's
        # import block providing ``csv``.
        import csv

        databases_path = Path(Container('').root_path()) / 'databases'
        if not databases_path.is_dir():
            return []

        data = []
        pattern = '{}_*.csv'.format(self._table)
        for file_path in databases_path.glob(pattern):
            # newline='' lets the csv module handle line endings itself.
            with open(str(file_path), 'r', newline='') as file:
                reader = csv.reader(file)
                columns = next(reader, None)
                if not columns:
                    # No header row: nothing can be mapped from this file.
                    continue

                # csv.reader yields [] for blank lines; drop those.
                data.extend(dict(zip(columns, row)) for row in reader if row)

        return data
コード例 #2
0
ファイル: crawler.py プロジェクト: jherrerotardon/spies
    def _storage_items(self):
        """Persist the scraped items to disk with pickle.

        The items in ``self._items`` are serialized into the file named
        ``self._storage`` inside the container's data path, overwriting
        any previous content.

        Note: this method only writes the file; it does not clear
        ``self._items`` itself.

        :return:
        """
        # pickle produces bytes, so the file must be opened in binary mode;
        # text mode ('w') makes pickle.dump raise TypeError.
        with open((Container()).data_path() + '/' + self._storage,
                  'wb') as file:
            pickle.dump(self._items, file=file)
コード例 #3
0
    def download_reviews(self):
        """Start the ``entity`` crawler for the currently loaded entity.

        The crawler is pointed at the URL built by
        ``_generate_restaurant_url`` and told to store its output in a
        freshly named file inside the container's data path.

        :return:
        """
        target_url = self._generate_restaurant_url()
        output_file = '/'.join((Container().data_path(), Guid.generate()))

        Launcher.start_crawler(
            [target_url],
            endpoint=self._endpoint['name'].lower(),
            extractor_name='entity',
            storage=output_file,
            entity_id=self._entity['id'],
        )
コード例 #4
0
    def download_restaurants(self, downloads: list):
        """Start one ``restaurants`` crawler per download configuration.

        Each configuration must provide the ``name``, ``endpoint_id`` and
        ``city_id`` keys. Every crawler gets its own freshly named output
        file inside the container's data path.

        :param downloads: Download configurations to process, in order.
        :return:
        """
        for config in downloads:
            target_url = self._generate_restaurants_url(config)
            output_file = '/'.join((Container().data_path(), Guid.generate()))

            crawler_options = dict(
                endpoint=config['name'].lower(),
                extractor_name='restaurants',
                storage=output_file,
                endpoint_id=config['endpoint_id'],
                city_id=config['city_id'],
            )

            Launcher.start_crawler([target_url], **crawler_options)
コード例 #5
0
    def _get_model_storage(self):
        """Build the location of the model file.

        :return: ``Path`` to ``MODEL_FILE`` inside the ``models`` folder
            of the container's storage path.
        """
        storage_root = Container().storage_path()
        model_location = storage_root + '/models/' + self.MODEL_FILE

        return Path(model_location)
コード例 #6
0
    def _dataset_file_path(self) -> str:
        """Build the location of the training dataset file.

        :return: ``DATASET_FILE`` joined onto the container's data path.
        """
        data_dir = Container().data_path()

        return data_dir + '/' + self.DATASET_FILE
コード例 #7
0
import csv
import gzip
import pickle
import re
import string
from io import StringIO
from pathlib import Path

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import RegexpTokenizer
from pyframework.container import Container

# NLTK requirements: where missing resources are downloaded to, and which
# resources this module needs (given as NLTK resource paths).
NLTK_RESOURCES_DIR = Container().root_path() + '/.venv/nltk_data'
NLTK_RESOURCES = ['corpora/stopwords']

# Make sure every required NLTK resource is available. This runs at import
# time: a resource that cannot be found is downloaded into
# NLTK_RESOURCES_DIR.
for resource in NLTK_RESOURCES:
    try:
        nltk.data.find(resource)
    except LookupError:
        # Only the last path component (e.g. 'stopwords') is passed to
        # the downloader as the package name.
        nltk.download(resource.split('/')[-1], download_dir=NLTK_RESOURCES_DIR)

# Imported after the check above, presumably so the corpus is already on
# disk when it is first read below.
from nltk.corpus import stopwords

# English stop words, kept in a set for membership tests.
stopwords_list = set(stopwords.words('english'))


def tokenize_clean_text(text) -> list:
コード例 #8
0
ファイル: crawler.py プロジェクト: jherrerotardon/spies
    def _storage_evidence(self, content):
        """Write raw page content to an HTML file in the data path.

        The file is named ``<name>_<page>.html`` from ``self.name`` and
        ``self._page`` and is overwritten if it already exists.

        :param content: Bytes to write to the file.
        :return:
        """
        data_dir = (Container()).data_path()
        evidence_path = '{}/{}_{}.html'.format(data_dir, self.name, self._page)

        with open(evidence_path, 'wb') as evidence_file:
            evidence_file.write(content)
コード例 #9
0
ファイル: exec.py プロジェクト: jherrerotardon/spies
import sys
from pathlib import Path

from pyframework.container import Container

# Run the framework container rooted at this script's directory and pass
# its result to sys.exit.
project_root = Path(__file__).absolute().parent
ack = Container(str(project_root)).run()

sys.exit(ack)
コード例 #10
0
    def _model_path(self) -> str:
        """Build the location where the model is stored.

        :return: ``MODEL_NAME`` joined onto the container's data path.
        """
        data_dir = Container().data_path()

        return data_dir + '/' + self.MODEL_NAME