Example #1
    def getXml(self, dst_dir=None):
        """Get the XML file referenced by the row values and copy it to xml_dir

        :param dst_dir: xml_dir to copy the XML file into
        :type dst_dir: string
        :return: True if the file was copied, False otherwise
        :rtype: boolean

        """
        if self.md_file and dst_dir:
            dst_file = os.path.join(dst_dir, os.path.basename(self.md_file))
            helpers.getFile(self.md_file, dst_file, self.disable_ssl_verify)
            return True
        return False
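A minimal usage sketch; the md_file and disable_ssl_verify attributes are taken from the snippet, while the call site itself (the row object and the destination path) is hypothetical:

    # Hypothetical call site: 'row' is an instance of the class this method
    # belongs to, with md_file and disable_ssl_verify already set.
    if row.getXml(dst_dir='/data/node1/xml'):
        print('XML file copied to xml_dir')
    else:
        print('Nothing copied: no md_file or no destination directory')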
Example #2
    def readInputFile(self, site, inputType):
        # compute siteName up front so the error message below can always use it
        siteName = site.get('name').lower()

        # the command line parameter takes priority
        if self.options['inputKeywordsFile']:
            fileName = self.options['inputKeywordsFile']
        elif inputType == 'search terms':
            fileName = self.keywordsFiles.get(siteName, '')
        else:
            fileName = self.idListFiles.get(siteName, '')

        if not fileName:
            logging.error(f'No {inputType} file specified for {siteName}.')
            input("Press enter to continue...")

        logging.info(f'Using {inputType} file: {fileName}')

        file = helpers.getFile(fileName)
        results = file.splitlines()

        if not results:
            logging.error('No search terms or IDs found')
            input("Press enter to continue...")

        return results
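A sketch of how this might be called, assuming a site dictionary shaped like the ones built in Example #4 and hypothetical file names configured in options.ini:

    # Hypothetical call site: 'scraper' is an instance of the surrounding class.
    site = {'name': 'ExampleSite', 'url': 'https://www.example.com'}
    keywords = scraper.readInputFile(site, 'search terms')  # one term per line
    ids = scraper.readInputFile(site, 'id lists')           # one ID per line

Note that helpers.getFile is called here with one argument to return a file's contents, while Examples #1 and #3 call it with a source and a destination to copy or download a file; both usages are taken from the snippets themselves, not from a documented API.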
Example #3
import csv
import json
import os
import shutil
import time


def run():
    # Get the list of node files from the configured nodes_dir
    nodes_dir = config['MAIN']['nodes_dir']
    nodes_files = [f for f in os.listdir(nodes_dir)
                   if os.path.isfile(os.path.join(nodes_dir, f))]

    for count_file, nodes_file in enumerate(nodes_files, start=1):
        log.log('', 'INFO', 0)
        log.log(color.OKGREEN + u'*' * 80 + color.ENDC, 'INFO', 0)
        log.log(color.OKGREEN + u'File ' + str(count_file) + '/' + str(len(nodes_files)) + ': ' + nodes_file + '.' + color.ENDC, 'INFO', 0)

        # File names starting with '_' are skipped
        if nodes_file.startswith('_'):
            log.log(u'File skipped.', 'INFO', 0)
            continue

        file_path = os.path.join(nodes_dir, nodes_file)
        with open(file_path, 'r') as json_data:
            data = json.load(json_data)

        log.log(color.BOLD + u'Organisme: ' + data['organisme'] + color.ENDC, 'INFO', 0)

        for count_node, node in enumerate(data['nodes'], start=1):
            log.log('', 'INFO', 0)
            log.log(color.OKBLUE + u'-' * 80 + color.ENDC, 'INFO', 0)
            log.log(color.OKBLUE + u'Node ' + str(count_node) + '/' + str(len(data['nodes'])) + color.ENDC, 'INFO', 0)
            log.log(u'Description: ' + node['description'], 'INFO', 0)

            if node['active'] == '0':
                log.log(u'Node disabled.', 'INFO', 0)
                continue

            # Log node information
            log.log(u'Active: ' + node['active'], 'INFO', 0)
            log.log(u'CSV file: ' + os.path.join(node['src_csv_path'], node['src_csv_file']), 'INFO', 0)
            log.log(u'XML directory: ' + node['xml_dir'], 'INFO', 0)
            log.log(u'TMP directory: ' + node['tmp_dir'], 'INFO', 0)

            # Full path to the CSV file
            csv_filepath = os.path.join(node['tmp_dir'], node['src_csv_file'])
            # Full path to the CSV temp file
            tmp_csv_file = 'tmp_' + node['src_csv_file']
            tmp_csv_filepath = os.path.join(node['tmp_dir'], tmp_csv_file)

            # Create tmp_dir if it does not exist
            if not os.path.isdir(node['tmp_dir']):
                os.makedirs(node['tmp_dir'])

            # Create xml_tmp_dir if it does not exist
            xml_tmp_dir = os.path.join(node['tmp_dir'], config['MAIN']['xml_tmp_dir'])
            if not os.path.isdir(xml_tmp_dir):
                os.makedirs(xml_tmp_dir)

            log.log(u'Start: ' + time.strftime("%Y-%m-%d %H:%M:%S"), 'INFO', 0)
            # Get the file from a local path or a URL
            helpers.getFile(os.path.join(node['src_csv_path'], node['src_csv_file']), csv_filepath)

            # Read the saved CSV file
            with open(csv_filepath, 'rt') as csv_file:
                reader = list(csv.DictReader(csv_file))

            # Config values are strings ('0'/'1'), so compare explicitly:
            # a bare truth test would also pass for the string '0'
            log.log(u'Metadata active: ' + config['MAIN']['metadata_active'], 'INFO', 0)
            if config['MAIN']['metadata_active'] == '1':
                getMetadata(node, reader)

            log.log(u'Data active: ' + config['MAIN']['data_active'], 'INFO', 0)
            if config['MAIN']['data_active'] == '1':
                getData(node, reader, tmp_csv_filepath)

            # Copy the CSV file to the tmp file
            shutil.copy(csv_filepath, tmp_csv_filepath)

            log.log(u'End: ' + time.strftime("%Y-%m-%d %H:%M:%S"), 'INFO', 0)
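For reference, a sketch of the inputs run() expects. Every key below appears in the code above; all values are illustrative. A node file in nodes_dir might look like:

    {
        "organisme": "Example Org",
        "nodes": [
            {
                "active": "1",
                "description": "Example node",
                "src_csv_path": "https://example.org/exports",
                "src_csv_file": "records.csv",
                "xml_dir": "/data/example/xml",
                "tmp_dir": "/data/example/tmp"
            }
        ]
    }

and the [MAIN] section of the configuration would provide something like (paths are hypothetical):

    [MAIN]
    nodes_dir = /etc/example/nodes
    xml_tmp_dir = xml_tmp
    metadata_active = 1
    data_active = 1

As the code above shows, files whose names start with '_' and nodes whose active flag is '0' are skipped.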
Example #4
    def initialize(self):
        suffix = helpers.getArgument('-w', False)

        if suffix:
            suffix = '-' + helpers.fileNameOnly(suffix, False)

        helpers.setUpLogging(suffix)

        logging.info('Starting\n')

        self.onItemIndex = 0
        self.onKeywordIndex = 0

        # store when we finished each site/keyword/directory combination
        self.database = Database('database.sqlite')
        self.database.execute(
            'create table if not exists history ( siteName text, keyword text, directory text, gmDate text, primary key(siteName, keyword, directory) )'
        )

        self.downloader = Downloader()
        self.dateStarted = datetime.datetime.now().strftime('%m%d%y')

        outputDirectory = os.path.join(str(Path.home()), 'Desktop',
                                       f'WebSearch_{self.dateStarted}')

        # set default options
        self.options = {
            'inputWebsitesFile': 'input_websites.txt',
            'inputKeywordsFile': '',
            'outputDirectory': outputDirectory,
            'secondsBetweenItems': 0,
            'maximumDaysToKeepItems': 90,
            'maximumResultsPerKeyword': 25000,
            'directoryToCheckForDuplicates': '',
            'useIdLists': 0
        }

        self.keywordsFiles = {}
        self.idListFiles = {}

        # read the options file
        helpers.setOptions('options.ini', self.options)
        helpers.setOptions('options.ini', self.keywordsFiles, 'search terms')
        helpers.setOptions('options.ini', self.idListFiles, 'id lists')

        # read command line parameters
        self.setOptionFromParameter('inputWebsitesFile', '-w')
        self.setOptionFromParameter('inputKeywordsFile', '-s')
        self.setOptionFromParameter('outputDirectory', '-d')

        if '-i' in sys.argv:
            self.options['maximumResultsPerKeyword'] = 1
            logging.info('Downloading by ID list')
            self.options['useIdLists'] = 1

        # read websites file
        file = helpers.getFile(self.options['inputWebsitesFile'])
        self.sites = []

        for item in file.splitlines():
            # skip blank lines so they don't become empty site entries
            if not item.strip():
                continue

            name = helpers.findBetween(item, '', ' ')
            url = helpers.findBetween(item, ' ', '')

            self.sites.append({'name': name, 'url': url})

        self.removeOldEntries()
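To make the configuration concrete, here is a sketch of the files initialize() reads. The option names and the 'search terms' and 'id lists' section names come from the code above; the INI layout and all values are assumptions:

    ; options.ini (the section name holding the default options is assumed)
    [main]
    inputWebsitesFile = input_websites.txt
    outputDirectory = /home/user/Desktop/WebSearch
    secondsBetweenItems = 5
    maximumResultsPerKeyword = 25000

    [search terms]
    examplesite = examplesite_keywords.txt

    [id lists]
    examplesite = examplesite_ids.txt

The websites file holds one site per line, a name and a URL separated by a space, which is the format the findBetween calls above parse:

    examplesite https://www.example.com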