def extendedProductExtraction(keyword = 'iphone 7', onlyFollowings = False, AllPageButId = False):
    """Save the products referenced by a keyword's already-labeled pairs.

    Reads the labeled pairs back through ``Trainer.readLabeledPairs``, gathers
    the distinct ids found in the first element of every pair, looks the
    matching products up under ``paths.newProductVectorFolder3`` and stores
    them with ``Trainer.saveSpecificProduct``.

    The pairs path and the products path are obtained from
    ``getLabeledPairsAndProductsPath`` with the same arguments that
    ``extendedPairs`` passes, so both functions address the same locations
    for a given (keyword, onlyFollowings, AllPageButId) combination.

    Args:
        keyword: Search keyword whose labeled pairs are processed.
        onlyFollowings: Forwarded to ``getLabeledPairsAndProductsPath``.
        AllPageButId: Forwarded to ``getLabeledPairsAndProductsPath``.

    Returns:
        None.
    """
    # The full import list is kept, in its original order, in case any of
    # these project modules performs set-up work at import time.
    import paths, SparkLogFileHandler, SearchExtractor, FinalizedRunners, NewProductPreferrer, PythonVersionHandler, Trainer

    outputFolder = paths.joinPath(paths.HDFSRootFolder, 'weekAugust')
    # Previously `outputPath` and `productsPath` were read without ever being
    # assigned in this function; derive them the same way extendedPairs does.
    outputPath, productsPath = getLabeledPairsAndProductsPath(
        outputFolder,
        keyword,
        onlyFollowings = onlyFollowings,
        AllPageButId = AllPageButId,
    )

    pairs = Trainer.readLabeledPairs(outputPath)
    # Element 0 of each labeled pair is iterated and flattened into one
    # collection of ids; duplicates are dropped.
    ids = pairs.flatMap(lambda i: i[0]).distinct()
    PythonVersionHandler.print_logging(
        ids.count(),
        'ids have been gathered from the labeled pairs by',
        PythonVersionHandler.nowStr(),
    )

    productVectorFolder = paths.newProductVectorFolder3
    products = Trainer.getProducts(ids, productVectorFolder)
    Trainer.saveSpecificProduct(products, productsPath)
def extendedPairs(keyword = 'iphone 7', onlyFollowings = False, AllPageButId = False):
    """Build labeled training pairs for one keyword and save them with their products.

    Steps, in order:
      1. Load the keyword's extracted logs from the 'weekAugust' folder under
         ``paths.HDFSRootFolder`` (no filtering).
      2. Pass them through ``SearchExtractor.searchNProductLogsForSingleKeyword``
         and ``NewProductPreferrer.trainingInstancesForSingleKeyword``.
      3. If no pairs result, stop without writing anything.
      4. Otherwise coalesce the pairs to 24 partitions, save them, then look up
         and save the products whose ids appear in the pairs.

    Args:
        keyword: Search keyword; spaces are replaced by underscores to form
            the input folder/file name.
        onlyFollowings: Forwarded to the pair builder and to
            ``getLabeledPairsAndProductsPath``.
        AllPageButId: Forwarded to the pair builder and to
            ``getLabeledPairsAndProductsPath``.

    Returns:
        None.
    """
    import paths, SparkLogFileHandler, SearchExtractor, FinalizedRunners, NewProductPreferrer, PythonVersionHandler, Trainer

    slug = keyword.replace(' ', '_')
    weekFolder = paths.joinPath(paths.HDFSRootFolder, 'weekAugust')

    # Input layout: <weekFolder>/<slug>/<slug>_extractedLogs
    relativeLogsPath = '/'.join((slug, slug + '_extractedLogs'))
    preparedLogs = FinalizedRunners.getPreparedLogsFromHDFS(
        paths.joinPath(weekFolder, relativeLogsPath),
        filtering = False,
    )

    keywordLogs = SearchExtractor.searchNProductLogsForSingleKeyword(preparedLogs, keyword)
    labeledPairs = NewProductPreferrer.trainingInstancesForSingleKeyword(
        keywordLogs,
        onlyFollowings = onlyFollowings,
        AllPageButId = AllPageButId,
    )

    # Nothing to label for this keyword: leave without saving anything.
    if labeledPairs.isEmpty():
        return

    labeledPairs = labeledPairs.coalesce(24)
    pairsTarget, productsTarget = getLabeledPairsAndProductsPath(
        weekFolder,
        keyword,
        onlyFollowings = onlyFollowings,
        AllPageButId = AllPageButId,
    )
    SparkLogFileHandler.saveRDDToHDFS(labeledPairs, pairsTarget)

    # Element 0 of each pair is flattened into a single collection of ids,
    # which is then de-duplicated.
    productIds = labeledPairs.flatMap(lambda pair: pair[0]).distinct()
    PythonVersionHandler.print_logging(
        productIds.count(),
        'ids have been gathered from the labeled pairs by',
        PythonVersionHandler.nowStr(),
    )

    matchedProducts = Trainer.getProducts(productIds, paths.newProductVectorFolder3)
    Trainer.saveSpecificProduct(matchedProducts, productsTarget)