def getXml(self, dst_dir=None):
    """Get XML file from row values and copy it to xml_dir.

    The destination keeps the base name of ``self.md_file``; the transfer
    itself is delegated to ``helpers.getFile``.

    :param dst_dir: xml_dir to copy XML file
    :type dst_dir: string
    :return: True when a copy was requested, False when there was nothing
        to do (the row has no ``md_file`` or no ``dst_dir`` was given)
    :rtype: boolean
    """
    # Nothing to fetch without both a source and a destination directory.
    if not (self.md_file and dst_dir):
        return False

    dst_file = os.path.join(dst_dir, os.path.basename(self.md_file))
    helpers.getFile(self.md_file, dst_file, self.disable_ssl_verify)
    # NOTE(review): the outcome of helpers.getFile is not inspected here;
    # if it reports failures through its return value, propagate that
    # instead of an unconditional True.
    return True
def readInputFile(self, site, inputType):
    """Return the lines of the input file that applies to ``site``.

    The file named by ``self.options['inputKeywordsFile']`` wins when it is
    set. Otherwise the file is looked up by lower-cased site name in
    ``self.keywordsFiles`` (when ``inputType`` is ``'search terms'``) or in
    ``self.idListFiles`` (any other ``inputType``).

    Problems (no file configured, no lines read) are logged and the user is
    asked to press enter; processing then carries on.

    :param site: mapping with at least a ``'name'`` entry
    :param inputType: label of the wanted input, e.g. ``'search terms'``
    :return: list of lines read through ``helpers.getFile``
    """
    commandLineFile = self.options['inputKeywordsFile']

    if commandLineFile:
        # the command line parameter takes priority
        fileName = commandLineFile
    else:
        siteName = site.get('name').lower()
        perSiteFiles = self.keywordsFiles if inputType == 'search terms' else self.idListFiles
        fileName = perSiteFiles.get(siteName, '')

        # Only the per-site lookup can come back empty.
        if not fileName:
            logging.error(f'No {inputType} file specified for {siteName}.')
            input("Press enter to continue...")

    logging.info(f'Using {inputType} file: {fileName}')

    contents = helpers.getFile(fileName)
    results = list(contents.splitlines())

    if not results:
        logging.error('No search terms or ID\'s found')
        input("Press enter to continue...")

    return results
def run():
    """Process every nodes file found in the configured nodes directory.

    For each regular file in ``config['MAIN']['nodes_dir']`` whose name does
    not start with ``'_'``, the file is loaded as JSON and each entry of its
    ``'nodes'`` list is handled in turn. A node whose ``'active'`` value is
    the string ``'0'`` is only reported as disabled. For any other node the
    function:

    * creates the node's ``tmp_dir`` and the XML temp directory below it,
    * fetches the source CSV into ``tmp_dir`` through ``helpers.getFile``,
    * reads that CSV into a list of row dicts,
    * calls ``getMetadata`` / ``getData`` depending on the
      ``metadata_active`` / ``data_active`` settings,
    * copies the fetched CSV to a ``tmp_``-prefixed file in ``tmp_dir``.

    Progress is reported through ``log.log`` throughout.
    """
    # Disabled alternative kept from the original (single-file mode):
    # if args.file:
    #     pass
    # else:
    # Get list of nodes files from config node_dir (regular files only)
    nodes_files = [f for f in os.listdir(config['MAIN']['nodes_dir']) if os.path.isfile(os.path.join(config['MAIN']['nodes_dir'], f))]

    # 1-based position of the current file, used only for progress output
    count_file = 1

    for nodes_file in nodes_files:
        log.log('', 'INFO', 0)
        log.log(color.OKGREEN + u'*' * 80 + color.ENDC, 'INFO', 0)
        log.log(color.OKGREEN + u'File ' + str(count_file) + '/' + str(len(nodes_files)) + ': ' + nodes_file + '.' + color.ENDC, 'INFO', 0)

        # A leading underscore in the file name marks a file to ignore
        if nodes_file.startswith('_'):
            log.log(u'File skipped.', 'INFO', 0)
        else:
            file_path = os.path.join(config['MAIN']['nodes_dir'], nodes_file)

            with open(file_path, 'r') as json_data:
                data = json.load(json_data)

                log.log(color.BOLD + u'Organisme: ' + data['organisme'] + color.ENDC, 'INFO', 0)

                # 1-based position of the current node, used only for progress output
                count_node = 1

                for node in data['nodes']:
                    log.log('', 'INFO', 0)
                    log.log(color.OKBLUE + u'-' * 80 + color.ENDC, 'INFO', 0)
                    log.log(color.OKBLUE + u'Node ' + str(count_node) + '/' + str(len(data['nodes'])) + color.ENDC, 'INFO', 0)
                    log.log(u'Description: ' + node['description'], 'INFO', 0)

                    # 'active' is compared as a string, so only the exact value '0' disables a node
                    if node['active'] == '0':
                        log.log(u'Node disabled.', 'INFO', 0)
                    else:
                        # Log node information
                        log.log(u'Active: ' + node['active'], 'INFO', 0)
                        log.log(u'CSV file: ' + os.path.join(node['src_csv_path'], node['src_csv_file']), 'INFO', 0)
                        log.log(u'XML directory: ' + node['xml_dir'], 'INFO', 0)
                        log.log(u'TMP directory: ' + node['tmp_dir'], 'INFO', 0)

                        # Full path to the local copy of the CSV file (inside tmp_dir)
                        csv_filepath = os.path.join(node['tmp_dir'], node['src_csv_file'])

                        # Full path to the 'tmp_'-prefixed CSV file in the same directory
                        tmp_csv_file = 'tmp_' + node['src_csv_file']
                        tmp_csv_filepath = os.path.join(node['tmp_dir'], tmp_csv_file)

                        # Create tmp_dir
                        if not os.path.isdir(node['tmp_dir']):
                            os.makedirs(node['tmp_dir'])

                        # Create xml_tmp_dir (sub-directory of tmp_dir named by the config)
                        xml_tmp_dir = os.path.join(node['tmp_dir'], config['MAIN']['xml_tmp_dir'])
                        if not os.path.isdir(xml_tmp_dir):
                            os.makedirs(xml_tmp_dir)

                        log.log(u'Start: ' + str(time.strftime("%Y-%m-%d %H:%M:%S")), 'INFO', 0)

                        # Get file from path or URL and store it as csv_filepath
                        helpers.getFile(os.path.join(node['src_csv_path'], node['src_csv_file']), csv_filepath)

                        # Read CSV saved file; rows are materialised so they can be
                        # passed to both getMetadata and getData after the file is closed
                        with open(csv_filepath, 'rt') as csv_file:
                            reader = list(csv.DictReader(csv_file))

                        # metadata_active = 0/1
                        # NOTE(review): if config comes from configparser these values are
                        # strings and '0' would be truthy - confirm how config is loaded.
                        log.log(u'Metadata active: ' + str(config['MAIN']['metadata_active']), 'INFO', 0)
                        if config['MAIN']['metadata_active']:
                            getMetadata(node, reader)

                        log.log(u'Data active: ' + str(config['MAIN']['data_active']), 'INFO', 0)
                        # data_active = 0/1
                        if config['MAIN']['data_active']:
                            getData(node, reader, tmp_csv_filepath)

                        # Copy CSV file to tmp file. This happens after getData has been
                        # given tmp_csv_filepath - presumably so the next run can compare
                        # against this run's CSV; verify against getData.
                        shutil.copy(csv_filepath, tmp_csv_filepath)

                        log.log(u'End: ' + str(time.strftime("%Y-%m-%d %H:%M:%S")), 'INFO', 0)

                    count_node += 1

            # nodes_file.close()  (not needed: the JSON file is closed by the with block)

        count_file += 1
def initialize(self):
    """Set up logging, persistent state, options and the list of sites.

    Steps, in order: configure logging (with a suffix derived from the
    ``-w`` argument when present), reset the progress indexes, open the
    history database and make sure its table exists, create the downloader,
    build the default options, overlay values from ``options.ini`` and from
    the command line, load the websites file into ``self.sites`` and finally
    call ``self.removeOldEntries()``.
    """
    logSuffix = helpers.getArgument('-w', False)
    if logSuffix:
        logSuffix = '-' + helpers.fileNameOnly(logSuffix, False)
    helpers.setUpLogging(logSuffix)

    logging.info('Starting\n')

    self.onItemIndex = 0
    self.onKeywordIndex = 0

    # to store the time we finished given sites/keyword combinations
    self.database = Database('database.sqlite')
    createHistoryTable = 'create table if not exists history ( siteName text, keyword text, directory text, gmDate text, primary key(siteName, keyword, directory) )'
    self.database.execute(createHistoryTable)

    self.downloader = Downloader()

    self.dateStarted = datetime.datetime.now().strftime('%m%d%y')
    desktop = os.path.join(str(Path.home()), 'Desktop')

    # set default options
    self.options = {
        'inputWebsitesFile': 'input_websites.txt',
        'inputKeywordsFile': '',
        'outputDirectory': os.path.join(desktop, f'WebSearch_{self.dateStarted}'),
        'secondsBetweenItems': 0,
        'maximumDaysToKeepItems': 90,
        'maximumResultsPerKeyword': 25000,
        'directoryToCheckForDuplicates': '',
        'useIdLists': 0
    }
    self.keywordsFiles = {}
    self.idListFiles = {}

    # read the options file
    helpers.setOptions('options.ini', self.options)
    sectionTargets = (
        (self.keywordsFiles, 'search terms'),
        (self.idListFiles, 'id lists'),
    )
    for target, section in sectionTargets:
        helpers.setOptions('options.ini', target, section)

    # read command line parameters
    parameterFlags = (
        ('inputWebsitesFile', '-w'),
        ('inputKeywordsFile', '-s'),
        ('outputDirectory', '-d'),
    )
    for optionName, flag in parameterFlags:
        self.setOptionFromParameter(optionName, flag)

    if '-i' in sys.argv:
        self.options['maximumResultsPerKeyword'] = 1
        logging.info('Downloading by ID list')
        self.options['useIdLists'] = 1

    # read websites file: one site per line, name and url taken around a space
    websites = helpers.getFile(self.options['inputWebsitesFile'])
    self.sites = [
        {
            'name': helpers.findBetween(entry, '', ' '),
            'url': helpers.findBetween(entry, ' ', ''),
        }
        for entry in websites.splitlines()
    ]

    self.removeOldEntries()