def remove(self):
    """Permanently delete ``self._partial`` from HDFS (recursive, bypassing trash)."""
    hdfs = Client(self._host, self._port,
                  effective_user=self._user, use_trash=False)
    # delete() yields lazily; exhaust the generator so the RPC actually runs
    for _ in hdfs.delete([self._partial], recurse=True):
        pass
def delete_item(config, filepath='', localpath=''):
    """Delete *filepath* from the configured storage backend and *localpath* from disk.

    Args:
        config: dict with at least 'BACKEND' ('hdfs'|'swift'|other) and,
            for hdfs, 'HADOOP_RPC_PORT'.
        filepath: backend-side path to remove (hdfs only).
        localpath: local filesystem path to remove (file or directory);
            missing files are ignored. Also covers the nfs backend.
    """
    backend = config['BACKEND']
    if backend == 'hdfs':
        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'],
                        use_trash=False)
        # snakebite's delete() is a lazy generator; drain it so the RPC fires
        for _ in client.delete([filepath], recurse=True):
            pass
    elif backend == 'swift':
        pass  # To be implemented
    # Local cleanup: a directory indicates a dataset, otherwise a single file.
    if os.path.isdir(localpath):
        shutil.rmtree(localpath)
    else:
        try:
            os.remove(localpath)
        except OSError:
            # best-effort: path may already be gone
            pass
def crfalign(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True): # crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config") # def cpath(n): # return os.path.join(crfConfigDir, n) # smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"), # config_path=cpath("eyeColor_config.txt")) # smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"), # config_path=cpath("hairColor_config.txt")) # print smEyeColor, smHairColor if location == "hdfs": if deleteFirst: namenode = "memex-nn1" port = 8020 client = Client(namenode, 8020, use_trash=True) try: for deleted in client.delete([outputDirectory], recurse=True): print deleted except FileNotFoundException as e: pass # hypothesis1: data fetched this way prompts the lzo compression error # hypothesis2: but it doesn't matter, error is just a warning rdd_crfl = sc.textFile(inputFilename) rdd_crfl.setName('rdd_crfl') if limit: rdd_crfl = sc.parallelize(rdd_crfl.take(limit)) if partitions: rdd_crfl = rdd_crfl.repartition(partitions) rdd_final = rdd_crfl print outputFormat if outputFormat == "sequence": rdd_final.saveAsSequenceFile(outputDirectory) elif outputFormat == "text": print "saving to %s" % outputDirectory rdd_final.saveAsTextFile(outputDirectory) else: raise RuntimeError("Unrecognized output format: %s" % outputFormat)
class HdfsReader:
    """
    HdfsReader class
    Connects to an hdfs endpoint (namenode) and checks argo profile files
    stored there. Uses a specific base path for determining argo file
    destinations.
    """

    def __init__(self, namenode, port, base_path):
        """
        Initialize HdfsReader which is used to check/read profile files from hdfs

        Args:
            namenode: str. hdfs namenode host
            port: int. hdfs namenode port
            base_path: str. base path to destination used for argo
        """
        self.client = Client(namenode, port)
        self.base_path = base_path

    def gen_profile_path(self, tenant, report, profile_type):
        """
        Generates a valid hdfs path to a specific profile

        Args:
            tenant: str. tenant to be used
            report: str. report to be used
            profile_type: str. profile type
                (operations|reports|aggregations|thresholds|recomputations)

        Returns:
            str: hdfs path
        """
        # filename templates keyed by profile type; {0}=tenant, {1}=report
        templates = {
            'operations': '{0}_ops.json',
            'aggregations': '{0}_{1}_ap.json',
            'reports': '{0}_{1}_cfg.json',
            'thresholds': '{0}_{1}_thresholds.json',
            'recomputations': 'recomp.json',
        }
        sync_path = self.base_path.replace("{{tenant}}", tenant)
        filename = templates[profile_type].format(tenant, report)
        return os.path.join(sync_path, filename)

    def cat(self, tenant, report, profile_type):
        """
        Returns the contents of a profile stored in hdfs

        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type
                (operations|reports|aggregations|thresholds)

        Returns:
            tuple: (parsed json contents or None, bool success flag)
        """
        path = self.gen_profile_path(tenant, report, profile_type)
        try:
            txt = self.client.cat([path])
            # fix: use the next() builtin (py2.6+ and py3 compatible) instead
            # of the py2-only .next() generator method
            j = json.loads(next(next(txt)))
            return j, True
        except FileNotFoundException:
            return None, False

    def rem(self, tenant, report, profile_type):
        """
        Removes a profile file that already exists in hdfs
        (in order to be replaced)

        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type
                (operations|reports|aggregations|thresholds)

        Returns:
            bool: True if deleted, False if the file was absent
        """
        path = self.gen_profile_path(tenant, report, profile_type)
        try:
            # delete() is lazy; advance it once to execute the removal
            next(self.client.delete([path]))
            return True
        except FileNotFoundException:
            return False
swift_client = swift.Connection(user=swift_user, key=swift_key, authurl=swift_authurl) # read list of files src_files = [] if run_mode == "hdfs": # spotify's snakebite as hdfs client src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ] # deleting output directory if exists if (hdfs_client.test(target_dir, exists=True, directory=True)): hdfs_client.delete(target_dir) hdfs_client.rmdir(target_dir) elif run_mode == "swift": # read list of files from swift src_files = [] src_file_regex = re.compile(source_files) for data in swift_client.get_container(source_dir)[1]: if src_file_regex.match(data['name']): src_files.append(data['name']) src_files.sort(key=lambda x: os.path.basename(x)) else: # read list of files from local src_files = filter(os.path.isfile, glob.glob(os.path.join(source_dir, source_files)))
from snakebite.client import Client from constants import * client = Client('localhost', NAMENODE_PORT) for p in client.delete(['/foo/bar','/input'], recurse=True): print p
def get_json(request):
    """Monolithic AJAX endpoint for the company/house admin UI.

    GET requests are dispatched on the ``action`` query parameter
    (filters stored pickled in the session, card/list lookups, deletions);
    POST requests are dispatched on ``action`` in the parsed body
    (create/save operations). Returns a JSON HttpResponse with CORS
    opened to any origin.
    """
    response_data = {}
    if request.method == "GET":
        r = request.GET
        rg = request.GET.get
        # Filter by city / street / house
        if r.has_key("action") and rg("action") == 'filter-addresslist':
            city = request.GET["city"].strip()
            street = request.GET["street"].strip()
            house = request.GET["house"].strip()
            if city != "" or street != "" or house != "":
                request.session["filter_addresslist"] = pickle.dumps({
                    'city': city,
                    'street': street,
                    'house': house
                })
            # all fields empty means "clear the filter"
            if city == "" and street == "" and house == "":
                if request.session.has_key("filter_addresslist"):
                    del request.session["filter_addresslist"]
            response_data = {"result": "ok"}
        # Clear the city/street/house filter
        if r.has_key("action") and rg("action") == 'filter-addresslist-clear':
            if request.session.has_key("filter_addresslist"):
                del request.session["filter_addresslist"]
            response_data = {"result": "ok"}
        # Filter by city / street / house / company
        if r.has_key("action") and rg("action") == 'filter-company':
            city = request.GET["city"].strip()
            street = request.GET["street"].strip()
            house = request.GET["house"].strip()
            company = request.GET["company"].strip()
            request.session["filter_company"] = pickle.dumps({
                'city': city,
                'street': street,
                'house': house,
                'company': company
            })
            if city == "" and street == "" and house == "" and company == "":
                if request.session.has_key("filter_company"):
                    del request.session["filter_company"]
            response_data = {"result": "ok"}
        # Clear the company filter
        if r.has_key("action") and rg("action") == 'filter-company-clear':
            if request.session.has_key("filter_company"):
                del request.session["filter_company"]
            response_data = {"result": "ok"}
        # Filter by contracts
        if r.has_key("action") and rg("action") == 'filter-contract':
            inn = request.GET["inn"].strip()
            manager = request.GET["manager"].strip()
            company = request.GET["company"].strip()
            request.session["filter_contract"] = pickle.dumps({
                'manager': manager,
                'inn': inn,
                'company': company
            })
            if inn == "" and company == "" and manager == "":
                if request.session.has_key("filter_contract"):
                    del request.session["filter_contract"]
            response_data = {"result": "ok"}
        # Clear the contracts filter
        if r.has_key("action") and rg("action") == 'filter-contract-clear':
            if request.session.has_key("filter_contract"):
                del request.session["filter_contract"]
            response_data = {"result": "ok"}
        # Log entries for a company card
        if r.has_key("action") and rg("action") == 'get-company-list-logs':
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            log_list = []
            for row in comments_logs.objects.filter(
                    manager=company, log=True).order_by("-datetime_create"):
                log_list.append({
                    "comment": row.comment,
                    "user": row.user.get_full_name(),
                    "date": row.datetime_create.strftime("%d.%m.%Y")
                })
            response_data = {"result": "ok", "data": log_list}
        # Company card: list of comments
        if r.has_key("action") and rg("action") == 'get-company-list-comments':
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            comment_list = []
            for row in comments_logs.objects.filter(
                    manager=company, log=False).order_by("-datetime_create"):
                comment_list.append({
                    "comment": row.comment,
                    "user": row.user.get_full_name(),
                    "date": row.datetime_create.strftime("%d.%m.%Y")
                })
            response_data = {"result": "ok", "data": comment_list}
        # Company card: list of contracts
        if r.has_key("action") and rg(
                "action") == 'get-company-list-contracts':
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            contract_list = []
            for row in contracts.objects.filter(
                    company=company).order_by("-datetime_create"):
                contract_list.append({
                    "contract_id": row.id,
                    "num": row.num,
                    "date_begin": row.date_begin.strftime("%d.%m.%Y"),
                    "date_end": row.date_end.strftime("%d.%m.%Y"),
                    "goon": u"Да" if row.goon else u"Нет",
                    "money": "%.2f" % row.money,
                    "period": row.period.name,
                    "manager": row.manager.get_full_name(),
                    "author": row.user.get_full_name(),
                    "create": row.datetime_create.strftime("%d.%m.%Y"),
                    "comment": row.comment
                })
            response_data = {"result": "ok", "data": contract_list}
        # Company card: data for a single contract
        if r.has_key("action") and rg("action") == 'get-company-contract-one':
            contract_id = request.GET["contract-id"]
            contract = contracts.objects.get(pk=int(contract_id, 10))
            rec = {
                "contract_id": contract.id,
                "num": contract.num,
                "date_begin": contract.date_begin.strftime("%d.%m.%Y"),
                "date_end": contract.date_end.strftime("%d.%m.%Y"),
                "goon": "yes" if contract.goon else "no",
                "money": "%.2f" % contract.money,
                "period": contract.period.id,
                "manager": contract.manager.id,
                "comment": contract.comment
            }
            response_data = {"result": "ok", "rec": rec}
        # Delete a contract (logged against the company card)
        if r.has_key("action") and rg("action") == 'contract-delete':
            contract_id = request.GET["contract_id"]
            contract = contracts.objects.get(pk=int(contract_id, 10))
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            comments_logs.objects.create(
                manager=company,
                user=request.user,
                comment=u"Удален договор {num} ({author} {create})".format(
                    num=contract.num,
                    author=contract.user.get_full_name(),
                    create=contract.datetime_create.strftime("%d.%m.%Y")),
                log=True)
            contract.delete()
            response_data = {"result": "ok"}
        # Company card: list of uploaded files
        if r.has_key("action") and rg(
                "action") == 'get-company-list-hdfs-files':
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            file_list = []
            for row in files.objects.filter(
                    company=company).order_by("-datetime_load"):
                file_list.append({
                    "file_id": row.id,
                    "filename": row.filename,
                    "author": row.user.get_full_name(),
                    "create": row.datetime_load.strftime("%d.%m.%Y")
                })
            response_data = {"result": "ok", "data": file_list}
        # Company card: delete an uploaded file (DB record + HDFS blocks)
        if r.has_key("action") and rg("action") == 'company-file-delete':
            file_id = request.GET["file_id"]
            company_id = request.GET["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            fob = files.objects.get(pk=file_id)
            comments_logs.objects.create(
                manager=company,
                user=request.user,
                # NOTE(review): the template below contains no {filename}
                # placeholder, so .format(filename=...) leaves the text
                # unchanged — looks like a lost placeholder; confirm the
                # intended log message.
                comment=u"Удален файл (unknown)".format(
                    filename=fob.filename),
                log=True)
            fob.delete()
            # remove the file's blocks from HDFS; draining the generator
            # executes the deletion
            client = Client('10.6.0.135', 9000)
            for x in client.delete([
                    '/blocks/%s' % file_id,
            ], recurse=True):
                print x
            response_data = {"result": "ok"}
        # Company search (autocomplete by name or INN)
        if r.has_key("term") and rg("term") != "":
            term = request.GET["term"]
            obj = []
            data = block_managers.objects.filter(
                Q(name__icontains=term) | Q(inn__icontains=term))
            for row in data:
                obj.append({
                    "label": u"{name} (ИНН {inn})".format(name=row.name,
                                                          inn=row.inn),
                    "value": row.id
                })
            response_data = obj
        # Log entries for a house card
        if r.has_key("action") and rg("action") == 'get-house-list-logs':
            house_id = request.GET["house"]
            house = buildings.objects.get(pk=int(house_id, 10))
            log_list = []
            for row in comments_logs.objects.filter(
                    house=house, log=True).order_by("-datetime_create"):
                log_list.append({
                    "comment": row.comment,
                    "user": row.user.get_full_name(),
                    "date": row.datetime_create.strftime("%d.%m.%Y")
                })
            response_data = {"result": "ok", "data": log_list}
        # House card: list of comments
        if r.has_key("action") and rg("action") == 'get-house-list-comments':
            house_id = request.GET["house"]
            house = buildings.objects.get(pk=int(house_id, 10))
            comment_list = []
            for row in comments_logs.objects.filter(
                    house=house, log=False).order_by("-datetime_create"):
                comment_list.append({
                    "comment": row.comment,
                    "user": row.user.get_full_name(),
                    "date": row.datetime_create.strftime("%d.%m.%Y")
                })
            response_data = {"result": "ok", "data": comment_list}
    if request.method == "POST":
        # SECURITY: eval() on a raw request body executes arbitrary code
        # supplied by the client — this should be json.loads(request.body).
        # Flagged for a dedicated fix.
        data = eval(request.body)
        # Save a company card
        if data.has_key("action") and data["action"] == 'company-common-save':
            company_id = data["company_id"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            address_id = data["address"]
            address_law_id = data["address_law"]
            address = address_house.objects.get(pk=int(address_id, 10))
            address_law = address_house.objects.get(pk=int(address_law_id, 10))
            name = data["name"].strip()
            inn = data["inn"].strip()
            phone = data["phone"].strip()
            email = data["email"].strip()
            contact = data["contact"].strip()
            company.name = name
            company.inn = inn
            company.phone = phone
            company.email = email
            company.contact = contact
            company.address = address
            company.address_law = address_law
            company.save()
            comments_logs.objects.create(
                manager=company,
                user=request.user,
                comment=u"Сохранены данные карточки компании",
                log=True)
            response_data = {"result": "ok"}
        # Create a company card
        if data.has_key(
                "action") and data["action"] == 'company-common-create':
            id_www = int(data["id_www"].strip(), 10)
            address_id = data["address"]
            address_law_id = data["address_law"]
            address = address_house.objects.get(pk=int(address_id, 10))
            address_law = address_house.objects.get(pk=int(address_law_id, 10))
            name = data["name"].strip()
            inn = data["inn"].strip()
            phone = data["phone"].strip()
            email = data["email"].strip()
            contact = data["contact"].strip()
            # Reject duplicates by external id (id_www) or INN
            if not block_managers.objects.filter(www_id=id_www).exists(
            ) and not block_managers.objects.filter(inn=inn).exists():
                new = block_managers.objects.create(www_id=id_www,
                                                    name=name,
                                                    inn=inn,
                                                    phone=phone,
                                                    email=email,
                                                    contact=contact,
                                                    address=address,
                                                    address_law=address_law)
                comments_logs.objects.create(
                    manager=new,
                    user=request.user,
                    comment=u"Создана карточка компании",
                    log=True)
                response_data = {"result": "ok", "id": new.id}
            else:
                response_data = {"result": "error"}
        # Add a comment to a company
        if data.has_key("action") and data["action"] == 'company-comment-add':
            company_id = data["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            comment = data["comment"].strip()
            if comment != "":
                comments_logs.objects.create(manager=company,
                                             user=request.user,
                                             comment=comment)
            response_data = {"result": "ok"}
        # Create a company contract
        if data.has_key("action") and data["action"] == 'contract-create':
            company_id = data["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            contracts.objects.create(
                company=company,
                num=data["num"].strip(),
                date_begin=datetime.datetime.strptime(data["date_begin"],
                                                      "%d.%m.%Y"),
                date_end=datetime.datetime.strptime(data["date_end"],
                                                    "%d.%m.%Y"),
                goon=True if data["goon"] == "yes" else False,
                money=Decimal(data["money"]),
                period=pay_period.objects.get(pk=int(data["period"], 10)),
                manager=User.objects.get(pk=int(data["manager"], 10)),
                user=request.user,
                comment=data["comment"].strip())
            response_data = {"result": "ok"}
        # Save a company contract
        if data.has_key("action") and data["action"] == 'contract-edit':
            company_id = data["company"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            contract_id = data["contract_id"]
            contract = contracts.objects.get(pk=int(contract_id, 10))
            contract.num = data["num"].strip()
            contract.date_begin = datetime.datetime.strptime(
                data["date_begin"], "%d.%m.%Y")
            contract.date_end = datetime.datetime.strptime(
                data["date_end"], "%d.%m.%Y")
            contract.goon = True if data["goon"] == "yes" else False
            contract.money = Decimal(data["money"])
            contract.period = pay_period.objects.get(
                pk=int(data["period"], 10))
            contract.manager = User.objects.get(pk=int(data["manager"], 10))
            contract.comment = data["comment"].strip()
            contract.save()
            comments_logs.objects.create(
                manager=company,
                user=request.user,
                comment=u"Сохранены данные договора {num} ({author} {create})".
                format(num=contract.num,
                       author=contract.user.get_full_name(),
                       create=contract.datetime_create.strftime("%d.%m.%Y")),
                log=True)
            response_data = {"result": "ok"}
        # Save a house card
        if data.has_key("action") and data["action"] == 'house-common-save':
            house_id = data["house"]
            house = buildings.objects.get(pk=int(house_id, 10))
            address_id = data["address"]
            address = address_house.objects.get(pk=int(address_id, 10))
            company_id = data["manager"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            numstoreys = data["numstoreys"].strip()
            numentrances = data["numentrances"].strip()
            numfloars = data["numfloars"].strip()
            access = data["access"].strip()
            house.numstoreys = numstoreys
            house.numentrances = numentrances
            house.numfloars = numfloars
            house.access = access
            house.address = address
            house.block_manager = company
            house.save()
            comments_logs.objects.create(
                house=house,
                user=request.user,
                comment=u"Сохранены данные карточки дома",
                log=True)
            response_data = {"result": "ok"}
        # Create a house card
        if data.has_key("action") and data["action"] == 'house-common-create':
            id_www = int(data["id_www"].strip(), 10)
            address_id = data["address"]
            address = address_house.objects.get(pk=int(address_id, 10))
            company_id = data["manager"]
            company = block_managers.objects.get(pk=int(company_id, 10))
            numstoreys = int(data["numstoreys"].strip(), 10)
            numentrances = int(data["numentrances"].strip(), 10)
            numfloars = int(data["numfloars"].strip(), 10)
            access = data["access"].strip()
            # NOTE(review): uniqueness of www_id is checked against
            # block_managers here, while the record is created in buildings —
            # possibly should query buildings; confirm intent.
            if not block_managers.objects.filter(www_id=id_www).exists():
                new = buildings.objects.create(www_id=id_www,
                                               numstoreys=numstoreys,
                                               numentrances=numentrances,
                                               numfloars=numfloars,
                                               access=access,
                                               address=address,
                                               block_manager=company)
                comments_logs.objects.create(house=new,
                                             user=request.user,
                                             comment=u"Создана карточка дома",
                                             log=True)
                response_data = {"result": "ok", "id": new.id}
            else:
                response_data = {"result": "error"}
        # Add a comment to a house
        if data.has_key("action") and data["action"] == 'house-comment-add':
            house_id = data["house"]
            house = buildings.objects.get(pk=int(house_id, 10))
            comment = data["comment"].strip()
            if comment != "":
                comments_logs.objects.create(house=house,
                                             user=request.user,
                                             comment=comment)
            response_data = {"result": "ok"}
    response = HttpResponse(json.dumps(response_data),
                            content_type="application/json")
    # CORS open to any origin — intentional per the original code
    response['Access-Control-Allow-Origin'] = "*"
    return response
def remove(self):
    """Recursively remove ``self._partial`` from HDFS without using the trash."""
    connection = Client(self._host,
                        self._port,
                        effective_user=self._user,
                        use_trash=False)
    results = connection.delete([self._partial], recurse=True)
    # the generator must be consumed for the deletion to take effect
    for _result in results:
        pass
def delete():
    """Remove /data/gz from HDFS on the 'study' namenode (non-recursive, no trash)."""
    client = Client("study", 9000, use_trash=False)
    # BUG FIX: snakebite's delete() returns a lazy generator; the original
    # never iterated it, so the delete RPC was never actually issued.
    for _ in client.delete(["/data/gz"], recurse=False):
        pass
def crfalign(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True): crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config") def cpath(n): return os.path.join(crfConfigDir, n) smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"), config_path=cpath("eyeColor_config.txt")) smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"), config_path=cpath("hairColor_config.txt")) print smEyeColor, smHairColor if location == "hdfs": if deleteFirst: namenode = "memex-nn1" port = 8020 client = Client(namenode, 8020, use_trash=True) try: for deleted in client.delete([outputDirectory], recurse=True): print deleted except FileNotFoundException as e: pass # hypothesis1: data fetched this way prompts the lzo compression error # hypothesis2: but it doesn't matter, error is just a warning if partitions: if limit: rdd_crfl = sc.parallelize(rdd_crfl.take(limit)) rdd_crfl = rdd_crfl.repartition(partitions) else: print inputFilename rdd_crfl = sc.textFile(inputFilename, minPartitions=partitions) else: rdd_crfl = sc.textFile(inputFilename) rdd_crfl.setName('rdd_crfl') # rdd_crfl.persist() print "beginning: %s partitions" % rdd_crfl.getNumPartitions() # "value-only" RDD, not a pair RDD # but we have the URI in the -3 position # and the index in the -2 position rdd_withuri = rdd_crfl.map(lambda x: reconstructTuple(x)) # Note: groupByKey returns iterable, not data; so no point in printing rdd_grouped = rdd_withuri.groupByKey() # sort the vectors by index (within key groups) rdd_sorted = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))]) # find all contiguous spans of marked-up tokens # returns 0 or more dicts per URI key rdd_spans = rdd_sorted.mapValues(lambda x: computeSpans(x, indexed=True)) # flatten to (URI, single dict) on each line rdd_flat = rdd_spans.flatMapValues(lambda x: list(x)) # rdd_flat = 
rdd_flat.coalesce(rdd_flat.getNumPartitions() / 3) # # map any eyeColor spans using smEyeColor, hairType spans using smHairColor # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor})) rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor.findBestMatch, "hairType": smHairColor.findBestMatch})) # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": fakeFindBestMatch, "hairType": fakeFindBestMatch})) # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {})) # rdd_aligned = rdd_spans # rdd_final = rdd_crfl rdd_final = rdd_aligned print outputFormat if outputFormat == "sequence": rdd_final.saveAsSequenceFile(outputDirectory) elif outputFormat == "text": print "saving to %s" % outputDirectory rdd_final.saveAsTextFile(outputDirectory) else: raise RuntimeError("Unrecognized output format: %s" % outputFormat)
from snakebite.client import Client client = Client('localhost', 8020) #port is the RPC port of the namenode. for i in client.ls(['/user/cloudera/behrouz']): #takes a list of paths!! print i #get this parameters from /etc/hadoop/conf/core-site.xml under the fs.defaults #many of the methods in snake bite return generators #creating a directory: #create two directories behrouz, behrouz1/b1 on HDFS: print '*' * 40 for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True): print p print '*' * 40 #deleting files and directories: deletes any subdirectories and files a directory contains #recursively deleting the directories! for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True): print p print '*' * 40 # retrieving data from hdfs: #copying files from HDFS to Local file system: for f in client.copyToLocal(['/user/cloudera/wordCount.out'], '/home/cloudera/'): print f print '*' * 40 ####### #reading contents of a file for l in client.text(['/user/cloudera/testfile.txt']): print l #the text method automatically decompress and display gzip and bzip2 files.
from snakebite.client import Client

# Recursively remove /user/hadoop/test via the namenode RPC endpoint.
client = Client('localhost', 8020)
for outcome in client.delete(['/user/hadoop/test'], recurse=True):
    # one result dict per deleted path
    print(outcome)
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964109596L, 'block_replication': 2, 'modification_time': 1605946691680L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test'} {'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964267111L, 'block_replication': 3, 'modification_time': 1605964267975L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test2'} {'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1605950057832L, 'length': 0L, 'blocksize': 0L, 'owner': u'student9_7', 'path': '/student9_7/testdir'} ''' # Создадим пару директорий for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'], create_parent=True): print(p) ''' {'path': '/student9_7/py_dir_01', 'result': True} {'path': '/student9_7/py_dir_02', 'result': True} ''' # Удалим директорию `py_dir_01` for p in client.delete(['/student9_7/py_dir_01'], recurse=True): print(p) ''' {'path': '/student9_7/py_dir_01', 'result': True} ''' # Посмотрим что содержится в файле `test` for t in client.text(['/student9_7/test']): print(t) ''' test file for hdfs ''' # Скопируем файл `test` из хранилища в локальную домашнюю директорию под именем `retrived_file_via_py` for f in client.copyToLocal(['/student9_7/test'], 'retrived_file_via_py'): print(f)
#!/usr/bin/env python from snakebite.client import Client client = Client('localhost', 9000) # recurse=True is equivalent to rm -rf so be careful! for p in client.delete(['/foo', '/another'], recurse=True): print p
swift_client = swift.Connection( user = swift_user, key = swift_key, authurl = swift_authurl) # read list of files src_files = [] if run_mode == "hdfs": # spotify's snakebite as hdfs client src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ] # deleting output directory if exists if (hdfs_client.test(target_dir, exists = True, directory = True)): hdfs_client.delete(target_dir) hdfs_client.rmdir(target_dir) elif run_mode == "swift": # read list of files from swift src_files = [] source_files = '|'.join([ '(pagecounts-' + (datetime.now() - timedelta(hours=i)).strftime("%Y%m%d-%H") + '(.*))' for i in range(48, 71) ]) src_file_regex = re.compile(source_files) for data in swift_client.get_container(source_dir)[1]: if src_file_regex.match(data['name']): src_files.append(data['name']) src_files.sort(key = lambda x: os.path.basename(x)) else: # read list of files from local src_files = filter(os.path.isfile, glob.glob(os.path.join(source_dir, source_files)))