def start( self ):
    for index, news in enumerate( self.news_list, start=0 ):
        try:
            if news['status'] == 'pending':
                news_content = self.download_news( news )
                if news_content:
                    self.news_list[ index ]['status'] = 'completed'
                    self.news.append( news_content )
                    log.success('[ {nid} ] Data saved successfully!'.format(nid=news['id']))
                    print()
                    print()
                else:
                    error_message = 'Could not parse the data.'
                    log.error( error_message )
                    self.errors.append( error_message )
                    self.news_list[ index ]['errors'].append( error_message )
            else:
                log.warning('Data already acquired [ {nid} ]'.format(nid=news['id']))
        except Exception as error:
            log.error('Error while downloading the news item [ {nid} ]'.format(nid=news['id']))
            log.error( error )
        finally:
            # Persist progress after every item so a crash does not lose completed work.
            helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w' )
            helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w' )
def create_news_list( self ):
    news_list = helper.read_file(filename=self.news_list_file)
    news = []
    catalog = None
    nid = 0
    for line in news_list.split('\n'):
        if re.search(r'\[.*\]', line):
            # A line like "[CATALOG]" starts a new catalog section.
            catalog = line.replace('[', '').replace(']', '').replace('\n', '')
        elif line:
            # Every other non-empty line is "link,language,category".
            notice = 'notice-{catalog}-{id}'.format(catalog=catalog.upper(), id=str( nid ).zfill( self.news_id_length ))
            link = line.split(',')[0]
            language = line.split(',')[1]
            category = line.split(',')[2]
            news.append({
                'id': notice,
                'link': link,
                'language': language,
                'category': category,
                'errors': [],
                'status': 'pending',
                'catalog': catalog
            })
            nid += 1
    helper.create_file(filename='data/notices.json', content=news, format='json', mode='w')
    return news
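# Illustrative only: a minimal sketch of the list-file format create_news_list()
# expects. The catalog name, links and categories below are made-up placeholders,
# not real entries from the project's data:
#
#   [MOTORS]
#   https://example.com/news/1,en,product
#   https://example.com/news/2,pt,event
#
# Each entry line would then become a dict roughly like the following (the id is
# zero-padded to self.news_id_length digits):
#
#   {'id': 'notice-MOTORS-0000', 'link': 'https://example.com/news/1',
#    'language': 'en', 'category': 'product', 'errors': [],
#    'status': 'pending', 'catalog': 'MOTORS'}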
def __init__( self ):
    super( Scrapper, self ).__init__()
    if not os.path.isfile( self.news_list_file ):
        return exit('Data extraction list not found.')
    self.news_list = helper.read_file( filename=self.news_json_file, format='json' ) or self.create_news_list()
    self.news = helper.read_file( filename=self.dump_file, format='json' ) or []
def __init__(self, map_file):
    self.map = []
    preprocessed_map = Helper.load_file(map_file)
    # Converting file into 2D array (one list of characters per row)
    for line in preprocessed_map:
        self.map.append(list(line))
def set_image( news, index, link ):
    images_file = 'data/images.json'
    images = helper.read_file( images_file, format='json' ) if os.path.isfile( images_file ) else []
    try:
        new_path = set_image_link( news, index, link )
        images.append({
            'catalog': news['catalog'],
            'notice': news['id'],
            'downloaded': False,
            'original_path': link,
            'new_path': new_path
        })
        helper.create_file(images_file, images, mode='w', format='json')
        log.success('Image added to the download list [ {image_link} ]'.format(image_link=new_path))
    except Exception as error:
        log.error( error )
def __init__(self, instruction_file):
    self.instructions = []
    self.fixed_loop = False
    # Each line is "<operation> <argument>" (e.g. "acc +7"); store it as a dict
    # with the operation name and the argument cast to int.
    for line in Helper.load_file(instruction_file):
        processed_line = line.split()
        self.instructions.append(
            {"operation": processed_line[0], "arg": int(processed_line[1])})
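# Illustrative only: how a couple of raw instruction lines map onto the dicts
# built above. The sample lines are made up; any signed integer argument parses
# the same way via int().
sample_lines = ["nop +0", "acc -3"]
parsed = [{"operation": op, "arg": int(arg)}
          for op, arg in (line.split() for line in sample_lines)]
# parsed == [{'operation': 'nop', 'arg': 0}, {'operation': 'acc', 'arg': -3}]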
def __init__(self, boarding_pass_file):
    self.boarding_passes = []
    for boarding_pass in Helper.load_file(boarding_pass_file):
        # The boarding pass format is just binary, so we can substitute these characters:
        # F = 0
        # B = 1
        # L = 0
        # R = 1
        self.boarding_passes.append(int(boarding_pass.replace("F", "0").replace(
            "B", "1").replace("L", "0").replace("R", "1"), 2))
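# Illustrative only: the substitution above turns a pass such as "FBFBBFFRLR"
# into the binary string "0101100101", which int(..., 2) reads as seat ID 357.
seat = int("FBFBBFFRLR".replace("F", "0").replace("B", "1")
           .replace("L", "0").replace("R", "1"), 2)
assert seat == 357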
def __init__( self ):
    super( Images, self ).__init__()
    self.images_file = 'data/images.json'
    self.images_folder = 'data/news/'
    self.dump_file = 'data/news/dump.json'
    if os.path.isfile( self.images_file ):
        images = helper.read_file( self.images_file, format='json' )
        for index, image in enumerate(images, start=0):
            try:
                if not image['downloaded']:
                    path = 'data/{image_path}'.format(image_path=image['new_path'].replace('https://static.weg.net/', ''))
                    filename = os.path.basename( path )
                    folder = path.split('/')
                    folder.pop()
                    folder = '/'.join( folder )
                    base_url = 'http://www.weg.net'
                    download_url = image['original_path']
                    if not os.path.isdir( folder ):
                        os.makedirs(folder, exist_ok=True)
                    if not download_url.startswith('http'):
                        download_url = '{base_url}/{path}'.format(base_url=base_url, path=download_url)
                    if helper.download(type='image', filename=path, nid=index, url=download_url):
                        images[ index ]['downloaded'] = True
                        log.success('Image downloaded successfully [ {path} ]'.format(path=path))
                else:
                    log.warning('Image already downloaded [ {url} ]'.format(url=image['new_path']))
            except Exception as error:
                log.error( error )
            finally:
                # Persist download state after every image.
                helper.create_file(self.images_file, images, mode='w', format='json')
    else:
        log.error('[!] Image dump does not exist')
def __init__(self, passport_file):
    self.passport_list = []
    # The input separates passports with a blank line, so we split the file on
    # "\n\n" and then do some string manipulation because each record is a
    # space-separated list of "key:value" pairs rather than a ready-made dict.
    for line in Helper.load_file(passport_file, "\n\n"):
        line = line.replace("\n", " ")
        passport = Passport(
            {i.split(':')[0]: i.split(':')[1] for i in line.split(' ')})
        if passport.has_required_fields:
            self.passport_list.append(passport)
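# Illustrative only: how one raw passport record (blank-line delimited in the
# real input) is turned into the dict handed to Passport(). The field values
# are sample data, not real input.
record = "ecl:gry pid:860033327\nbyr:1937 hgt:183cm"
fields = {i.split(':')[0]: i.split(':')[1]
          for i in record.replace("\n", " ").split(' ')}
# fields == {'ecl': 'gry', 'pid': '860033327', 'byr': '1937', 'hgt': '183cm'}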
def __init__(self, form_file):
    self.answers = []
    # Groups are separated by a blank line, so we split the file on "\n\n".
    for line in Helper.load_file(form_file, "\n\n"):
        new_group = []
        # Each line in a group is its own element.
        temp_group = line.split('\n')
        # Transform the group into a list of sets, where each set is one
        # person's answers.
        for person in temp_group:
            new_group.append(set(person))
        self.answers.append(new_group)
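# Illustrative only: the per-person sets built above make the usual group
# counts one-liners. This is not part of the class, just a sketch of how the
# structure can be consumed; the answers are made up.
from functools import reduce

group = [set("abcx"), set("abcy"), set("abcz")]
anyone = len(reduce(set.union, group))           # questions anyone answered: 6
everyone = len(reduce(set.intersection, group))  # questions everyone answered: 3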
def __init__(self, baggage_file):
    self.rules_count_unique = {}
    self.rules_count_all = {}
    preprocessed_rules = Helper.load_file(baggage_file)

    # Loading dict for part 1.
    # Format: {bag1: [bag2, bag3], bag2: [bag3, bag4]}
    for rule in preprocessed_rules:
        rule = re.sub(r'(?:bag(s)?(\s)?(\.)?|\d+\s)', '', rule)
        rule = re.sub(r'\s(?:contain|,)\s', "-", rule).strip().split('-')
        bag_key = rule.pop(0)
        for item in rule:
            if item not in self.rules_count_unique:
                self.rules_count_unique[item] = []
            self.rules_count_unique[item].append(bag_key)

    # Loading dict for part 2.
    # Format: {bag1: [{unit: bag2, quanity: 1}, {unit: bag3, quanity: 3}],
    #          bag2: [{unit: bag4, quanity: 1}]}
    for rule in preprocessed_rules:
        rule = re.sub(r'bag(s)?(\s)?(\.)?', '', rule)
        rule = re.sub(r'\s(?:contain|,)\s', "-", rule).strip().split('-')
        bag_key = rule.pop(0)
        if bag_key not in self.rules_count_all:
            self.rules_count_all[bag_key] = []
        for item in rule:
            items = item.split(" ", 1)
            if items[0] == "no":
                break
            self.rules_count_all[bag_key].append({
                "quanity": int(items[0]),
                "unit": items[1]
            })
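# Illustrative only: what the two regex passes above produce for a single
# sample rule line (phrased in the usual "X bags contain N Y bags" style).
#
#   rule_line = "light red bags contain 1 bright white bag, 2 muted yellow bags."
#
# The part 1 pass strips the counts and the word "bag(s)", leaving only colours,
# so rules_count_unique['bright white'] and rules_count_unique['muted yellow']
# both gain 'light red' (a reverse "is contained by" index).
#
# The part 2 pass keeps the counts, so
#   rules_count_all['light red'] == [{'quanity': 1, 'unit': 'bright white'},
#                                    {'quanity': 2, 'unit': 'muted yellow'}]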
class TweetCollector:
    COLLECT_MODE_30DAYS = 'endpoint_30day'
    COLLECT_MODE_ARCHIVE = 'endpoint_archive'

    def __init__(self, group_name, root_path='./', strict_mode=True, collect_mode=COLLECT_MODE_30DAYS):
        self.collect_mode = collect_mode
        self.root_path = root_path.rstrip('/') + '/'
        self.helper = Helper(self.root_path)
        self.strict_mode = strict_mode
        self.group_name = group_name
        self.__initiate_api()
        if not exists('{}tweets'.format(self.root_path)):
            makedirs('{}tweets'.format(self.root_path))

    def initiate_collection(self, hashtags, toDate=None):
        self.hashtags = hashtags
        self.toDate = toDate
        if self.strict_mode:
            tweet_files = "{}tweets/{}".format(self.root_path, self.group_name)
            if not exists(tweet_files):
                makedirs(tweet_files)
            files = [
                f for f in listdir(tweet_files) if isfile(join(tweet_files, f))
            ]
            if len(files) > 0:
                print(
                    "You're in strict mode. Either enter non-strict mode or delete the tweet files under the ./tweets/{} directory."
                    .format(self.group_name))
                return
        try:
            self.__collect_tweets()
        except TwitterRequestError:
            print(
                "Failed to fetch tweets. Check your API limitation in the Twitter dashboard.\n"
            )

    def __initiate_api(self):
        consumer_key = self.helper.config_item('twitter_config.consumer_key')
        consumer_secret = self.helper.config_item(
            'twitter_config.consumer_secret')
        access_token = self.helper.config_item('twitter_config.access_token')
        access_token_secret = self.helper.config_item(
            'twitter_config.access_token_secret')
        self.api = TwitterAPI(consumer_key, consumer_secret, access_token,
                              access_token_secret)

    def __collect_tweets(self):
        self.__prepare_request()
        print("Starting to save tweets...\n")
        count = 0
        temp_repo = ""
        for item in self.pager.get_iterator():
            if 'text' in item:
                temp_repo += json.dumps(item)
                temp_repo += "\n"
                count += 1
                if count % 100 == 0:
                    # Flush every 100 tweets into a timestamped .wtr file.
                    print(
                        "{} tweets already stored in file...\n".format(count))
                    dt = datetime.datetime.now()
                    file_name = '{}tweets/{}/{}{}_{}{}{}.wtr'.format(
                        self.root_path, self.group_name, dt.strftime('%b'),
                        dt.strftime('%d'), dt.strftime('%H'),
                        dt.strftime('%M'), dt.strftime('%S'))
                    with open(file_name, "a+") as f:
                        f.write(temp_repo)
                    temp_repo = ""
            elif 'message' in item:
                print("Process Stopped:\n")
                print("{}: {}".format(item['code'], item['message']))
                break
            else:
                print("No Text Entry Detected:\n")
                print(item)
                break
        if len(temp_repo) > 0:
            # Write out whatever is left once the iterator is exhausted.
            print("{} tweets already stored in file...\n".format(count))
            dt = datetime.datetime.now()
            file_name = '{}tweets/{}/{}{}_{}{}{}.wtr'.format(
                self.root_path, self.group_name, dt.strftime('%b'),
                dt.strftime('%d'), dt.strftime('%H'), dt.strftime('%M'),
                dt.strftime('%S'))
            with open(file_name, "a+") as f:
                f.write(temp_repo)

    def __prepare_request(self):
        hash_combine = self.hashtags if (
            type(self.hashtags) is str) else " OR ".join(self.hashtags)
        query = "({}) lang:en".format(hash_combine)
        endpoint = self.helper.config_item('twitter_config.{}'.format(
            self.collect_mode))
        request_config = {'query': query, 'maxResults': 100}
        if self.toDate is not None:
            if not self.__validate_parameter(self.toDate, 'toDate'):
                raise Exception(self.validation_error)
            request_config['toDate'] = self.toDate
        self.pager = TwitterPager(self.api, endpoint, request_config)

    def __validate_parameter(self, value, category):
        output = False
        if category == 'toDate' or category == 'fromDate':
            if type(value) is not str:
                self.validation_error = 'toDate must be in string format'
            elif len(value) != 12:
                self.validation_error = 'toDate must be in yyyyMMddHHmm format'
            else:
                output = True
        else:
            self.validation_error = 'Provided parameter is not supported'
        return output
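# Illustrative usage sketch only, not part of the class. It assumes a config
# readable by Helper that provides the twitter_config.* keys used above
# (consumer key/secret, access token/secret and the premium search endpoints)
# and that valid API credentials exist; the group name and hashtags are made up.
if __name__ == '__main__':
    collector = TweetCollector('climate', root_path='./', strict_mode=True,
                               collect_mode=TweetCollector.COLLECT_MODE_30DAYS)
    # toDate, when given, must be a 12-character yyyyMMddHHmm string.
    collector.initiate_collection(['#climatechange', '#globalwarming'],
                                  toDate='202001311200')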
class Summarize:
    def __init__(self, communities, root_path='./'):
        self.helper = Helper(root_path)
        self.communities = communities
        self.summarized = []

    def run(self):
        for community in self.communities:
            sentences = [
                tweet['preprocessed_text']
                for tweet in self.communities[community]
            ]
            vectorize = TfidfVectorizer()
            tfidfs = vectorize.fit_transform(sentences)
            aggregate_tfidf = self.__populate_tweet_tfidf(
                tfidfs, len(sentences), self.communities[community])
            self.__select_most_representative(aggregate_tfidf,
                                              self.communities[community])
        return self.summarized

    def __populate_tweet_tfidf(self, tfidfs, doc_length, tweets):
        result = dict()
        for doc in range(doc_length):
            score = 0
            feature_index = tfidfs[doc, :].nonzero()[1]
            tfidf_scores = zip(feature_index,
                               [tfidfs[doc, x] for x in feature_index])
            # Sum the tf-idf weights of every term in the tweet ...
            for s in [s for (i, s) in tfidf_scores]:
                score += s
            # ... then add the engagement/popularity/length bonuses.
            score += self.__compute_tweet_additional_score(tweets[doc])
            result[doc] = score
        # Sort tweets by score, highest first.
        result = {
            key: val
            for key, val in sorted(
                result.items(), key=lambda item: item[1], reverse=True)
        }
        return result

    def __compute_tweet_additional_score(self, tweet):
        score = self.helper.config_item('scoring.verified',
                                        1) if tweet['user']['verified'] else 0
        faves = tweet['faves']
        rt = tweet['retweets']
        fave_rt_const = self.helper.config_item('scoring.faves_rt_constant',
                                                0.0005)
        followings = tweet['user'][
            'followings'] if tweet['user']['followings'] > 0 else 1
        followers = tweet['user']['followers']
        popularity_const = self.helper.config_item(
            'scoring.popularity_constant', 0.001)
        word_count = len(tweet['preprocessed_text'].split())
        word_count_constant = self.helper.config_item(
            'scoring.tweet_length_constant', 0.001)
        score += (faves + rt) * fave_rt_const
        score += (followers - followings) * popularity_const
        score += word_count * word_count_constant
        return score

    def __select_most_representative(self, scores, tweets):
        community_representatives = []
        selection_share = self.helper.config_item(
            'global.representative_share', 0.001)
        selection_threshold = ceil(len(tweets) * selection_share)
        counter = 0
        print(" -Selecting {} tweet(s) from community".format(
            selection_threshold))
        for chosen_index in scores:
            counter += 1
            community_representatives.append(tweets[chosen_index])
            if counter > selection_threshold:
                break
        self.summarized.append(community_representatives)
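# Illustrative usage sketch only. It assumes the project's Helper/config setup
# is in place and that communities are shaped like the dicts consumed above:
# {community_id: [tweet, ...]} where each tweet carries 'preprocessed_text',
# 'faves', 'retweets' and a 'user' dict with 'verified', 'followers' and
# 'followings'. All values below are made up.
communities = {
    0: [
        {'preprocessed_text': 'vaccine rollout starts next week',
         'faves': 12, 'retweets': 3,
         'user': {'verified': False, 'followers': 500, 'followings': 200}},
        {'preprocessed_text': 'new vaccine centre opens downtown',
         'faves': 40, 'retweets': 9,
         'user': {'verified': True, 'followers': 9000, 'followings': 150}},
    ]
}
representatives = Summarize(communities).run()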
P_PACKAGES = Path('packages').absolute()
P_CLIPS = Path('clips').absolute()
P_OUTPUTS = Path('outputs').absolute()
P_VIDEOS_MEDIA = Path('videos').absolute()

dirs_paths = [P_PACKAGES, P_CLIPS, P_OUTPUTS, P_VIDEOS_MEDIA]
check_paths()

modules = []
modules.append(
    Packager(ERROR_MESSAGES, P_PACKAGES, P_CLIPS, P_OUTPUTS, P_VIDEOS_MEDIA))
modules.append(Downloader(ERROR_MESSAGES, modules[0], P_TOKENS_FILE))
modules.append(Encoder(ERROR_MESSAGES, modules[0]))
modules.append(Editor(ERROR_MESSAGES, modules[0], P_VIDEOS_MEDIA))
modules.append(Uploader(ERROR_MESSAGES, modules[0], P_TOKENS, P_VIDEOS_MEDIA))
modules.append(Tweeter(ERROR_MESSAGES, P_TOKENS_FILE, modules[0], P_TWEETS))
modules.append(Wrapper(ERROR_MESSAGES, modules, P_SCHEDULE))
modules.append(Helper())

user_input('clear', [])
welcome()

while True:
    user_inp = input('>> ')
    if user_inp:
        inp = user_inp.split(' ')
        command = inp[0]
        args = inp[1:]
        user_input(command, args)
def get_content( news, content ):
    if not content[0]:
        return ''

    allowed_images_extension = ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tif']
    document = BeautifulSoup( content[0].encode('utf-8'), 'html.parser' )
    to_remove = ['comparison', 'bgdark', 'bglight', 'default', 'clr', 'novaJanela']
    link = news['link']
    catalog = news['catalog']
    nid = news['id']

    # Strip legacy CSS classes from the original markup.
    for item in to_remove:
        if document.select('.{selector}'.format(selector=item)):
            for element in document.select('.{selector}'.format(selector=item)):
                index = element['class'].index( item )
                del element['class'][ index ]

    if document.select('.center'):
        for center in document.select('.center'):
            center['class'] = 'text-center'

    if document.select('p'):
        paragraphs = document.select('p')
        for paragraph in paragraphs:
            for content in paragraph.contents:
                if content == '\xa0' or not content:
                    paragraph.decompose()

    if document.select('table'):
        tables = document.select('table')
        tablefilename = 'logs/weg/tables.list'
        link = link if isinstance( link, str ) else link.attrs['href']
        table_log = '[ {nid} ]: {link}\n'.format(link=link, nid=nid)
        for table in tables:
            to_remove = ['cellpadding', 'border', 'cellspacing', 'width', 'height']
            responsive = document.new_tag('div')
            responsive['class'] = 'table-responsive'
            table.wrap( responsive )
            table['class'].append('table table-bordered table-hover')
            for item in to_remove:
                del table[ item ]
            if os.path.isfile( tablefilename ):
                content = helper.read_file( tablefilename )
                if link not in content:
                    helper.create_file(tablefilename, table_log)
                else:
                    log.warning('Table already added to the list [ {url} ]'.format(url=link))
            else:
                helper.create_file(tablefilename, table_log)
                log.success('Table log created.')

    if document.select('a'):
        for index, link in enumerate( document.select('a'), start=0 ):
            if 'href' in link.attrs:
                filename, file_extension = os.path.splitext( link.attrs['href'] )
                if link.attrs['href'] == 'javascript:void();':
                    link.attrs['href'] = '#{nid}'.format(nid=news['id'])
                    link.attrs['data-prevent-default'] = 'true'
                if file_extension in allowed_images_extension:
                    set_image( news, index, link.attrs['href'] )
                    link.attrs['href'] = set_image_link( news, index, link.attrs['href'] )

    if document.select('img'):
        for index, image in enumerate( document.select('img'), start=0 ):
            filename, file_extension = os.path.splitext( image.attrs['src'] )
            responsive = True
            if file_extension in allowed_images_extension:
                set_image( news, index, image.attrs['src'] )
                image.attrs['src'] = set_image_link( news, index, image.attrs['src'] )
            # for parent in image.parents:
            #     if 'class' in parent.attrs:
            #         if 'coluna6' in parent.attrs['class']:
            #             responsive = False
            # if responsive:
            #     if 'class' in image.attrs:
            #         image.attrs['class'].append('img-responsive')
            #     else:
            #         image.attrs['class'] = 'img-responsive'

    if document.select('.coluna6'):
        columns = document.select('.coluna6')
        for column in columns:
            column['class'] = 'xtt-gallery pull-right'

    if document.select('ul'):
        for ul in document.select('ul'):
            ul['class'] = 'xtt-list-style'
            for li in ul.select('> li'):
                span = document.new_tag('span')
                span.string = li.contents[0]
                li.string = ''
                li.append( span )

    return str( document ).strip()
def __init__(self, xmas_file):
    self.xmas_data = Helper.load_file(xmas_file, cast_int=True)
def __init__(self, password_file):
    self.password_list = Helper.load_file(password_file)
class Preprocessor:
    def __init__(self, hashtags_list, root_path='./'):
        self.spell = SpellChecker()
        self.helper = Helper(root_path)
        self.slangs = self.helper.slang_hashmap()
        self.hashtags = [ht.strip().lower().replace('#', '') for ht in hashtags_list]

    def preprocess_tweet(self, input):
        self.tweet = input
        self.__remove_urls()
        self.__remove_usernames()
        self.__remove_non_latin()
        self.__remove_stopwords()
        self.__prune_slang_dictation()
        self.__remove_stopwords()
        self.__remove_special_chars()
        self.__final_pruning()
        return self.tweet.lower(), self.__ignore_tweet()

    def __remove_usernames(self):
        self.tweet = re.sub(r"(?=[^\w])\@\w+(?=[^\w]|$)", r"", self.tweet)

    def __remove_non_latin(self):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        self.tweet = emoji_pattern.sub(r"", self.tweet)
        self.tweet = self.tweet.encode('ascii', 'ignore').decode('ascii')
        # Decode the HTML-escaped ampersand left over by the Twitter API.
        self.tweet = self.tweet.replace("&amp;", "&")

    def __remove_special_chars(self):
        self.tweet = self.tweet.replace('\n', ' ').replace('\r', '')
        self.tweet = re.sub(r"[^\w\s]", r"", self.tweet)

    def __remove_urls(self):
        self.tweet = re.sub(r"https://t.co/\w*", r"", self.tweet)

    def __remove_stopwords(self):
        nlp = spacy.load("en_core_web_sm")
        self.tweet = " ".join([token.text for token in nlp(self.tweet) if not token.is_stop])

    def __prune_slang_dictation(self):
        words = word_tokenize(self.tweet)
        new_words = []
        for word in words:
            change = ''
            if self.__should_be_checked_for_slang(word) and word.upper() in self.slangs:
                change = "Abbr: {} => {}".format(word, self.slangs[word.upper()])
                new_words.append(self.slangs[word.upper()])
            elif self.__should_be_checked_for_correction(word):
                correct_word = self.spell.correction(word)
                if not word == correct_word:
                    if correct_word.upper() in self.slangs:
                        change = "Abbr Correction: {} => {}".format(word, self.slangs[correct_word.upper()])
                        new_words.append(self.slangs[correct_word.upper()])
                    else:
                        change = "Correction: {} => {}".format(word, correct_word)
                        new_words.append(correct_word)
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        self.tweet = " ".join(new_words)

    def __ignore_tweet(self):
        words_threshold = self.helper.config_item('global.words_threshold')
        words = word_tokenize(re.sub(r"[^\w\s]", r"", self.tweet))
        return len(words) < words_threshold

    def __should_be_checked_for_slang(self, word):
        result = True
        exceptions = self.helper.config_item('global.abbr_exceptions')
        exceptions = [ex.strip().lower() for ex in exceptions.split(',')]
        if word.lower() in exceptions:
            result = False
        elif word.lower() in self.hashtags:
            result = False
        # Possibly a name
        elif word[0].isupper() and word[1:].islower():
            result = False
        elif len(word) < 2:
            result = False
        return result

    def __should_be_checked_for_correction(self, word):
        result = True
        exceptions = self.helper.config_item('global.correction_exceptions')
        exceptions = [ex.strip().lower() for ex in exceptions.split(',')]
        uppercase_chars = [ch for ch in word if ch.isupper()]
        # Ignore words that contain characters other than A to Z
        if not re.match(r"^[A-Za-z]+$", word):
            result = False
        elif word.lower() in exceptions:
            result = False
        elif word.lower() in self.hashtags:
            result = False
        # Possibly a name
        elif word[0].isupper() and word[1:].islower():
            result = False
        # Ignore the word if it has more than one uppercase letter
        elif len(uppercase_chars) > 1:
            result = False
        return result

    def __final_pruning(self):
        self.tweet = re.sub(r"\b[0-9]+\b", r"", self.tweet)
        self.tweet = re.sub(r"\s+", r" ", self.tweet.strip().lower())
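# Illustrative usage sketch only. It assumes the project's config provides
# global.words_threshold, global.abbr_exceptions and global.correction_exceptions,
# that a slang lookup table is available via Helper.slang_hashmap(), and that the
# spaCy model "en_core_web_sm" is installed. The tweet text is made up.
pre = Preprocessor(['#climatechange'])
clean_text, too_short = pre.preprocess_tweet(
    "@someone check this out https://t.co/abc123 IMO it's great! #ClimateChange")
# clean_text is the lower-cased, URL/username/slang-pruned text;
# too_short is True when the cleaned tweet falls below the word threshold.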
class DBI:
    MODE_NOT_CHANGED = 0
    MODE_UPDATED = 1
    MODE_INSERTED = 2

    def __init__(self, root_path='./'):
        self.root_path = root_path.rstrip('/') + '/'
        self.helper = Helper(self.root_path)
        self.conf_name = 'mongodb_config'
        dsn = self.helper.config_item('{}.dsn'.format(self.conf_name))
        if type(dsn) is str and len(dsn) > 0:
            dbh = MongoClient(dsn)
        else:
            dbh = MongoClient(
                host=self.helper.config_item('{}.host'.format(self.conf_name)),
                port=self.helper.config_item('{}.port'.format(self.conf_name)),
                username=self.helper.config_item('{}.username'.format(
                    self.conf_name)),
                password=self.helper.config_item('{}.password'.format(
                    self.conf_name)))
        self.db = dbh[self.helper.config_item('{}.db_name'.format(
            self.conf_name))]

    def insert(self, collection, document):
        collection = self.db[collection]
        if type(document) is dict:
            result = collection.insert_one(document)
            many = False
        elif type(document) is list:
            result = collection.insert_many(document)
            many = True
        else:
            result = None
        if result is not None and result.acknowledged is True:
            inserted_ids = result.inserted_ids if many is True else result.inserted_id
        else:
            inserted_ids = None
        status = inserted_ids is not None
        return status, inserted_ids

    def upsert(self, collection, filter, document):
        collection_name = collection
        collection = self.db[collection]
        status = False
        mode = None
        instance_id = None
        try:
            result = collection.replace_one(filter, document, upsert=True)
            if result.acknowledged:
                status = True
                if result.matched_count > 0:
                    mode = self.MODE_NOT_CHANGED if result.modified_count == 0 else self.MODE_UPDATED
                    instance_id = result.upserted_id
                else:
                    mode = self.MODE_INSERTED
                    instance_id = result.upserted_id
        except DuplicateKeyError:
            # Retry without the conflicting _id so Mongo can assign a new one.
            del document['_id']
            return self.upsert(collection_name, filter, document)
        except WriteError:
            pass
        return status, mode, instance_id

    def row_exists(self, collection, filter):
        collection = self.db[collection]
        result = collection.count_documents(filter)
        return result > 0

    def find_all(self, collection, where, custom_fields=None):
        collection = self.db[collection]
        return collection.find(where, custom_fields) if type(
            custom_fields) is dict else collection.find(where)

    def update_one(self, collection, query, update):
        collection = self.db[collection]
        return collection.update_one(query, update)
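# Illustrative usage sketch only. It assumes a reachable MongoDB instance and a
# config readable by Helper that provides mongodb_config.dsn (or host/port/
# username/password) plus mongodb_config.db_name; the collection name and
# documents below are made up.
dbi = DBI()
ok, new_id = dbi.insert('tweets', {'text': 'hello world', 'faves': 3})
status, mode, _ = dbi.upsert('tweets', {'_id': new_id},
                             {'text': 'hello world', 'faves': 4})
if dbi.row_exists('tweets', {'faves': {'$gte': 4}}):
    for doc in dbi.find_all('tweets', {'faves': {'$gte': 4}}):
        print(doc)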
def __init__(self, expense_file):
    self.expense_list = Helper.load_file(expense_file, cast_int=True)
def __init__(self, adapter_file):
    self.adapter_list = Helper.load_file(adapter_file, cast_int=True)
    self.adapter_list.append(0)
    self.adapter_list.sort()