コード例 #1
0
ファイル: scrapper.py プロジェクト: waghcwb/weg-crawler
    def start( self ):
        """Download every pending news entry and checkpoint progress after each one.

        Entries whose status is not ``pending`` are only reported. A successful
        download marks the entry ``completed`` and stores its content in
        ``self.news``; an empty download records an error on the entry. Any
        exception is logged and the loop moves on. Both JSON files are rewritten
        after every entry, whatever the outcome.
        """
        for position, entry in enumerate( self.news_list ):
            try:
                if entry['status'] != 'pending':
                    log.warning('Dados já adquiridos [ {nid} ]'.format(nid=entry['id']))
                    continue

                downloaded = self.download_news( entry )

                if not downloaded:
                    error_message = 'Não foi possível fazer o parse dos dados.'
                    log.error( error_message )
                    self.errors.append( error_message )
                    self.news_list[ position ]['errors'].append( error_message )
                    continue

                self.news_list[ position ]['status'] = 'completed'
                self.news.append( downloaded )

                log.success('[ {nid} ] Dados salvos com sucesso!'.format(nid=entry['id']))

                print()
                print()
            except Exception as error:
                log.error('Erro ao baixar a notícia [ {nid} ]'.format(nid=entry['id']))
                log.error(error)
            finally:
                # Runs on every iteration (including the `continue` paths) so
                # progress survives an interruption.
                helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w')
                helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w')
コード例 #2
0
ファイル: data.py プロジェクト: waghcwb/weg-crawler
    def create_news_list( self ):
        """Build the list of news entries to crawl from the plain-text list file.

        The file is processed line by line. A line containing ``[...]`` starts a
        new catalog; every other non-empty line is read as
        ``link,language,category``. Each entry receives an id of the form
        ``notice-<CATALOG>-<zero padded counter>`` (the counter runs across all
        catalogs) and starts with status ``pending`` and an empty error list.
        The result is written to ``data/notices.json`` and returned.

        Returns:
            list: one dict per entry with the keys ``id``, ``link``,
            ``language``, ``category``, ``errors``, ``status`` and ``catalog``.

        Raises:
            AttributeError: if an entry line appears before any catalog header.
            IndexError: if an entry line has fewer than three comma-separated
                fields.
        """
        raw_list = helper.read_file(filename=self.news_list_file)
        news = []
        catalog = None
        nid = 0

        for line in raw_list.split('\n'):
            # Raw string: '\[' is not a valid escape in a normal string literal.
            if re.search(r'\[.*\]', line):
                catalog = line.replace('[', '').replace(']', '')
                continue

            if not line:
                continue

            # Split once instead of once per field.
            fields = line.split(',')
            notice = 'notice-{catalog}-{id}'.format(catalog=catalog.upper(), id=str( nid ).zfill( self.news_id_length ))

            news.append({
                'id': notice,
                'link': fields[0],
                'language': fields[1],
                'category': fields[2],
                'errors': [],
                'status': 'pending',
                'catalog': catalog
            })

            nid += 1

        helper.create_file(filename='data/notices.json', content=news, format='json', mode='w')

        return news
コード例 #3
0
ファイル: scrapper.py プロジェクト: waghcwb/weg-crawler
    def __init__( self ):
        """Load the crawl state, aborting when the source list file is missing.

        The news list comes from the JSON state file when that yields a truthy
        value, otherwise it is rebuilt with ``create_news_list``. Previously
        downloaded news are loaded from the dump file, defaulting to an empty
        list.
        """
        super( Scrapper, self ).__init__()

        list_available = os.path.isfile( self.news_list_file )
        if not list_available:
            return exit('Lista para extração de dados não encontrada.')

        saved_list = helper.read_file( filename=self.news_json_file, format='json' )
        self.news_list = saved_list if saved_list else self.create_news_list()

        saved_news = helper.read_file( filename=self.dump_file, format='json' )
        self.news = saved_news if saved_news else []
コード例 #4
0
    def __init__(self,
                 group_name,
                 root_path='./',
                 strict_mode=True,
                 collect_mode=COLLECT_MODE_30DAYS):
        """Store the collection settings, set up the API client and the tweets folder.

        Args:
            group_name: name of the group the collected tweets belong to.
            root_path: base directory; normalised to end with exactly one '/'.
            strict_mode: stored on the instance as given.
            collect_mode: stored on the instance as given.
        """
        self.group_name = group_name
        self.strict_mode = strict_mode
        self.collect_mode = collect_mode
        self.root_path = root_path.rstrip('/') + '/'
        # The helper must exist before the API is initiated.
        self.helper = Helper(self.root_path)
        self.__initiate_api()

        tweets_dir = '{}tweets'.format(self.root_path)
        if not exists(tweets_dir):
            makedirs(tweets_dir)
コード例 #5
0
    def __init__(self, map_file):
        """Load ``map_file`` into ``self.map`` as a 2D list of single characters.

        Args:
            map_file: passed straight to ``Helper.load_file``; presumably a
                path to a text file with one map row per line (TODO confirm).
        """
        # Each loaded line becomes one row and each character one cell.
        # list(line) already splits a string into characters, so the former
        # "".join([line]) wrapper was redundant.
        self.map = [list(line) for line in Helper.load_file(map_file)]
コード例 #6
0
ファイル: parser.py プロジェクト: waghcwb/weg-crawler
def set_image( news, index, link ):
    """Queue an image for download by appending it to ``data/images.json``.

    The existing list is loaded when the file exists, otherwise a new list is
    started. The entry records the catalog and notice id taken from ``news``,
    the original link and the rewritten path from ``set_image_link``. Any
    exception raised while building or saving the entry is logged, not raised.
    """
    images_file = 'data/images.json'

    if os.path.isfile( images_file ):
        images = helper.read_file( images_file, format='json' )
    else:
        images = []

    try:
        entry = {
            'catalog': news['catalog'],
            'notice': news['id'],
            'downloaded': False,
            'original_path': link,
            'new_path': set_image_link( news, index, link )
        }
        images.append( entry )

        helper.create_file(images_file, images, mode='w', format='json')

        message = 'Imagem adicionada para a lista de downloads [ {image_link} ]'.format(image_link=set_image_link( news, index, link ))
        log.success( message )
    except Exception as error:
        log.error( error )
コード例 #7
0
ファイル: day8.py プロジェクト: bstascavage/advent-of-code
    def __init__(self, instruction_file):
        """Parse the instruction file into a list of operation/argument dicts."""
        self.fixed_loop = False

        # Every loaded line is split on whitespace: the first token is the
        # operation name and the second its integer argument.
        tokenised = (line.split() for line in Helper.load_file(instruction_file))
        self.instructions = [
            {"operation": tokens[0], "arg": int(tokens[1])}
            for tokens in tokenised
        ]
コード例 #8
0
    def __init__(self, boading_pass_file):
        """Decode every boarding pass in the file into its integer value."""
        # A boarding pass is a binary number in disguise:
        # F and L stand for 0, B and R stand for 1.
        to_binary = str.maketrans("FBLR", "0101")

        self.boarding_passes = []
        for code in Helper.load_file(boading_pass_file):
            self.boarding_passes.append(int(code.translate(to_binary), 2))
コード例 #9
0
    def __init__(self, root_path='./'):
        """Open the MongoDB database described by the ``mongodb_config`` settings.

        A non-empty string ``dsn`` setting is used as the connection string;
        otherwise the client is built from the separate host, port, username
        and password settings. The selected database is stored in ``self.db``.
        """
        self.root_path = root_path.rstrip('/') + '/'
        self.helper = Helper(self.root_path)
        self.conf_name = 'mongodb_config'

        def setting(key):
            # Read "<conf_name>.<key>" from the configuration.
            return self.helper.config_item('{}.{}'.format(self.conf_name, key))

        dsn = setting('dsn')

        if type(dsn) is str and len(dsn) > 0:
            client = MongoClient(dsn)
        else:
            client = MongoClient(host=setting('host'),
                                 port=setting('port'),
                                 username=setting('username'),
                                 password=setting('password'))

        self.db = client[setting('db_name')]
コード例 #10
0
ファイル: images.py プロジェクト: waghcwb/weg-crawler
    def __init__( self ):
        """Download every image in ``data/images.json`` not yet marked as downloaded.

        For each pending entry the local target path is derived from its
        ``new_path``, the target folder is created when missing, and relative
        source links are prefixed with the site base URL. The images file is
        rewritten after every entry so progress is kept. Errors for a single
        image are logged and do not stop the loop.
        """
        super( Images, self ).__init__()

        self.images_file = 'data/images.json'
        self.images_folder = 'data/news/'
        self.dump_file     = 'data/news/dump.json'

        if not os.path.isfile( self.images_file ):
            log.error('[!] Dump de imagens não existe')
            return

        images = helper.read_file( self.images_file, format='json' )
        base_url = 'http://www.weg.net'

        for position, image in enumerate( images ):
            try:
                if image['downloaded']:
                    log.warning('Imagem já baixada [ {url} ]'.format(url=image['new_path']))
                    continue

                relative_path = image['new_path'].replace('https://static.weg.net/', '')
                path = 'data/{image_path}'.format(image_path=relative_path)
                # Everything before the last '/' is the folder to create.
                folder = path.rpartition('/')[0]
                download_url = image['original_path']

                if not os.path.isdir( folder ):
                    os.makedirs(folder, exist_ok=True)

                if not download_url.startswith('http'):
                    download_url = '{base_url}/{path}'.format(base_url=base_url, path=download_url)

                if helper.download(type='image', filename=path, nid=position, url=download_url):
                    images[ position ]['downloaded'] = True
                    log.success('Imagem baixada com sucesso [ {path} ]'.format(path=path))
            except Exception as error:
                log.error( error )
            finally:
                # Persist after every image, including skipped and failed ones.
                helper.create_file(self.images_file, images, mode='w', format='json')
コード例 #11
0
ファイル: day4.py プロジェクト: bstascavage/advent-of-code
    def __init__(self, passport_file):
        """Parse the passport file and keep the passports that have the required fields."""
        self.passport_list = []

        # Records are separated by a blank line ("\n\n"). Inside a record the
        # "key:value" pairs are separated by either a space or a newline, so
        # newlines are normalised to spaces before splitting.
        for record in Helper.load_file(passport_file, "\n\n"):
            fields = {}
            for pair in record.replace("\n", " ").split(' '):
                pieces = pair.split(':')
                fields[pieces[0]] = pieces[1]

            passport = Passport(fields)
            # NOTE(review): has_required_fields is read as an attribute here;
            # if it is a plain method this test is always true - confirm.
            if passport.has_required_fields:
                self.passport_list.append(passport)
コード例 #12
0
ファイル: day6.py プロジェクト: bstascavage/advent-of-code
    def __init__(self, form_file):
        """Parse the form file into groups of per-person answer sets."""
        # Groups are separated by a blank line ("\n\n"); within a group every
        # line belongs to one person. Each person becomes the set of characters
        # on their line, and each group a list of those sets.
        self.answers = [
            [set(person) for person in group.split('\n')]
            for group in Helper.load_file(form_file, "\n\n")
        ]
コード例 #13
0
    def __init__(self, baggage_file):
        """Parse the baggage rules into two lookup tables.

        ``rules_count_unique`` maps a contained bag to the bags that list it:
        ``{inner: [outer, ...]}``.
        ``rules_count_all`` maps a bag to its contents with quantities:
        ``{outer: [{"quanity": n, "unit": inner}, ...]}``.
        """
        self.rules_count_unique = {}
        self.rules_count_all = {}
        preprocessed_rules = Helper.load_file(baggage_file)

        # Part 1: drop the word "bag(s)" and the quantities, then split the
        # rule into the outer bag followed by the bags it mentions.
        for rule in preprocessed_rules:
            cleaned = re.sub(r'(?:bag(s)?(\s)?(\.)?|\d+\s)', '', rule)
            outer, *inner_bags = re.sub(r'\s(?:contain|,)\s', "-", cleaned).strip().split('-')

            for inner in inner_bags:
                self.rules_count_unique.setdefault(inner, []).append(outer)

        # Part 2: same split, but the quantities are kept.
        for rule in preprocessed_rules:
            cleaned = re.sub(r'bag(s)?(\s)?(\.)?', '', rule)
            outer, *inner_bags = re.sub(r'\s(?:contain|,)\s', "-", cleaned).strip().split('-')

            contents = self.rules_count_all.setdefault(outer, [])
            for inner in inner_bags:
                parts = inner.split(" ", 1)
                # An item starting with "no" ends this rule's contents.
                if parts[0] == "no":
                    break
                contents.append({
                    "quanity": int(parts[0]),
                    "unit": parts[1]
                })
コード例 #14
0
class TweetCollector:
    """Collect tweets for a set of hashtags and store them as JSON lines.

    Tweets are fetched through a ``TwitterPager`` and written in batches to
    ``<root_path>tweets/<group_name>/<timestamp>.wtr`` files, one JSON document
    per line.
    """

    COLLECT_MODE_30DAYS = 'endpoint_30day'
    COLLECT_MODE_ARCHIVE = 'endpoint_archive'

    def __init__(self,
                 group_name,
                 root_path='./',
                 strict_mode=True,
                 collect_mode=COLLECT_MODE_30DAYS):
        """Store the settings, create the API client and the ``tweets`` folder.

        Args:
            group_name: sub-folder of ``tweets`` used for this collection.
            root_path: base directory; normalised to end with exactly one '/'.
            strict_mode: when true, ``initiate_collection`` refuses to run if
                the group folder already contains files.
            collect_mode: suffix of the ``twitter_config`` setting that holds
                the endpoint (one of the ``COLLECT_MODE_*`` constants).
        """
        self.collect_mode = collect_mode
        self.root_path = root_path.rstrip('/') + '/'
        self.helper = Helper(self.root_path)
        self.strict_mode = strict_mode
        self.group_name = group_name
        self.__initiate_api()

        if not exists('{}tweets'.format(self.root_path)):
            makedirs('{}tweets'.format(self.root_path))

    def initiate_collection(self, hashtags, toDate=None):
        """Collect tweets for ``hashtags`` and store them under the group folder.

        Args:
            hashtags: a single query string or an iterable of strings that are
                combined with ``OR``.
            toDate: optional upper date bound as a 12 character
                ``yyyyMMddHHmm`` string.

        Raises:
            Exception: if ``toDate`` is given but fails validation.
        """
        self.hashtags = hashtags
        self.toDate = toDate

        tweet_files = "{}tweets/{}".format(self.root_path, self.group_name)

        # The group folder is needed for writing in every mode; it used to be
        # created only in strict mode, so non-strict runs on a fresh group
        # failed when the first batch was written.
        if not exists(tweet_files):
            makedirs(tweet_files)

        if self.strict_mode:
            files = [
                f for f in listdir(tweet_files) if isfile(join(tweet_files, f))
            ]

            if files:
                print(
                    "You're in strict mode. Either enter non strict mode or delete tweet files under ./tweets/{} directory."
                    .format(self.group_name))
                return

        try:
            self.__collect_tweets()
        except TwitterRequestError:
            print(
                "Failed to fetch tweets. Check your API limitation in Twitter dashboard.\n"
            )

    def __initiate_api(self):
        """Create the ``TwitterAPI`` client from the ``twitter_config`` settings."""
        consumer_key = self.helper.config_item('twitter_config.consumer_key')
        consumer_secret = self.helper.config_item(
            'twitter_config.consumer_secret')
        access_token = self.helper.config_item('twitter_config.access_token')
        access_token_secret = self.helper.config_item(
            'twitter_config.access_token_secret')

        self.api = TwitterAPI(consumer_key, consumer_secret, access_token,
                              access_token_secret)

    def __collect_tweets(self):
        """Iterate over the pager and write tweets to disk in batches of 100.

        Iteration stops at the first item that has neither a ``text`` nor a
        ``message`` key, or at the first item carrying a ``message``.
        """
        self.__prepare_request()

        print("Starting to save tweets...\n")
        count = 0
        buffered = []

        for item in self.pager.get_iterator():
            if 'text' in item:
                buffered.append(json.dumps(item))
                count += 1

                if count % 100 == 0:
                    self.__flush_buffer(buffered, count)
                    buffered = []
            elif 'message' in item:
                print("Process Stoped:\n")
                print("{}: {}".format(item['code'], item['message']))
                break
            else:
                print("No Text Entry Detected:\n")
                print(item)
                break

        # Write whatever is left over from the last, incomplete batch.
        if buffered:
            self.__flush_buffer(buffered, count)

    def __flush_buffer(self, lines, count):
        """Append ``lines`` (serialised tweets) to a timestamp-named file."""
        print("{} tweets already stored in file...\n".format(count))

        stamp = datetime.datetime.now().strftime('%b%d_%H%M%S')
        file_name = '{}tweets/{}/{}.wtr'.format(self.root_path,
                                                self.group_name, stamp)

        # The context manager closes the file even if the write fails.
        with open(file_name, "a+") as handle:
            handle.write("".join(line + "\n" for line in lines))

    def __prepare_request(self):
        """Build the search query and create the pager in ``self.pager``."""
        if isinstance(self.hashtags, str):
            hash_combine = self.hashtags
        else:
            hash_combine = " OR ".join(self.hashtags)

        query = "({}) lang:en".format(hash_combine)
        endpoint = self.helper.config_item('twitter_config.{}'.format(
            self.collect_mode))

        request_config = {'query': query, 'maxResults': 100}

        if self.toDate is not None:
            if not self.__validate_parameter(self.toDate, 'toDate'):
                raise Exception(self.validation_error)
            request_config['toDate'] = self.toDate

        self.pager = TwitterPager(self.api, endpoint, request_config)

    def __validate_parameter(self, value, category):
        """Validate a request parameter; on failure set ``self.validation_error``.

        Only the ``toDate`` and ``fromDate`` categories are supported; their
        value must be a string of exactly 12 characters.

        Returns:
            bool: True when the value is acceptable.
        """
        if category not in ('toDate', 'fromDate'):
            self.validation_error = 'Provided parameter is not supported'
            return False

        # The message names the category actually being validated (it used to
        # say "toDate" for fromDate as well).
        if not isinstance(value, str):
            self.validation_error = '{} must be in string format'.format(
                category)
            return False

        if len(value) != 12:
            self.validation_error = '{} must be in yyyyMMddHHmm format'.format(
                category)
            return False

        return True
コード例 #15
0
ファイル: summarization.py プロジェクト: hshahsahebi/wigon
 def __init__(self, communities, root_path='./'):
     """Keep the communities to summarise and start with an empty result list."""
     self.summarized = []
     self.communities = communities
     self.helper = Helper(root_path)
コード例 #16
0
ファイル: summarization.py プロジェクト: hshahsahebi/wigon
class Summarize:
    """Pick the most representative tweets of every community.

    Each tweet is scored by the sum of its TF-IDF term weights plus a bonus
    computed from account and engagement data; the top share of every
    community is collected in ``self.summarized``.
    """

    def __init__(self, communities, root_path='./'):
        """Store the communities and create the configuration helper.

        Args:
            communities: mapping of community key to a list of tweet dicts.
            root_path: passed to ``Helper``.
        """
        self.helper = Helper(root_path)
        self.communities = communities
        self.summarized = []

    def run(self):
        """Score and select tweets for every community.

        Returns:
            list: one list of selected tweets per community, in iteration
            order of ``self.communities``.
        """
        for community in self.communities:
            tweets = self.communities[community]
            sentences = [tweet['preprocessed_text'] for tweet in tweets]
            tfidfs = TfidfVectorizer().fit_transform(sentences)
            ranked = self.__populate_tweet_tfidf(tfidfs, len(sentences),
                                                 tweets)
            self.__select_most_representative(ranked, tweets)
        return self.summarized

    def __populate_tweet_tfidf(self, tfidfs, doc_length, tweets):
        """Return ``{tweet index: score}`` ordered from highest to lowest score."""
        result = {}

        for doc in range(doc_length):
            # Sum the weights of the terms that occur in this document.
            feature_index = tfidfs[doc, :].nonzero()[1]
            score = sum(tfidfs[doc, x] for x in feature_index)
            score += self.__compute_tweet_additional_score(tweets[doc])
            result[doc] = score

        return dict(
            sorted(result.items(), key=lambda item: item[1], reverse=True))

    def __compute_tweet_additional_score(self, tweet):
        """Return the bonus derived from verification, engagement, popularity and length."""
        score = self.helper.config_item('scoring.verified',
                                        1) if tweet['user']['verified'] else 0

        faves = tweet['faves']
        rt = tweet['retweets']
        fave_rt_const = self.helper.config_item('scoring.faves_rt_constant',
                                                0.0005)
        # A non-positive followings count is treated as 1.
        followings = tweet['user'][
            'followings'] if tweet['user']['followings'] > 0 else 1
        followers = tweet['user']['followers']
        popularity_const = self.helper.config_item(
            'scoring.popularity_constant', 0.001)
        word_count = len(tweet['preprocessed_text'].split())
        word_count_constant = self.helper.config_item(
            'scoring.tweet_length_constant', 0.001)

        score += (faves + rt) * fave_rt_const
        score += (followers - followings) * popularity_const
        score += word_count * word_count_constant

        return score

    def __select_most_representative(self, scores, tweets):
        """Append the top-scoring tweets of one community to ``self.summarized``.

        Args:
            scores: ``{tweet index: score}`` already ordered best first.
            tweets: the community's tweets, indexed by the keys of ``scores``.
        """
        selection_share = self.helper.config_item(
            'global.representative_share', 0.001)
        selection_threshold = ceil(len(tweets) * selection_share)

        print("    -Selecting {} tweet from community".format(
            selection_threshold))

        # Take exactly `selection_threshold` tweets. The previous loop only
        # stopped once the counter exceeded the threshold and therefore
        # selected one tweet more than announced.
        chosen = list(scores)[:selection_threshold]
        self.summarized.append([tweets[index] for index in chosen])
コード例 #17
0
# Absolute paths of the working directories used by the modules below.
P_PACKAGES = Path('packages').absolute()
P_CLIPS = Path('clips').absolute()
P_OUTPUTS = Path('outputs').absolute()
P_VIDEOS_MEDIA = Path('videos').absolute()
dirs_paths = list([P_PACKAGES, P_CLIPS, P_OUTPUTS, P_VIDEOS_MEDIA])
# NOTE(review): check_paths() takes no arguments, so it presumably reads the
# module-level dirs_paths defined just above - confirm against its definition.
check_paths()

# Module registry. Order matters: modules[0] (the Packager) is handed to most
# of the later modules, and Wrapper receives the `modules` list object itself,
# so it also sees the entries appended after it.
modules = list()
modules.append(
    Packager(ERROR_MESSAGES, P_PACKAGES, P_CLIPS, P_OUTPUTS, P_VIDEOS_MEDIA))
modules.append(Downloader(ERROR_MESSAGES, modules[0], P_TOKENS_FILE))
modules.append(Encoder(ERROR_MESSAGES, modules[0]))
modules.append(Editor(ERROR_MESSAGES, modules[0], P_VIDEOS_MEDIA))
modules.append(Uploader(ERROR_MESSAGES, modules[0], P_TOKENS, P_VIDEOS_MEDIA))
modules.append(Tweeter(ERROR_MESSAGES, P_TOKENS_FILE, modules[0], P_TWEETS))
modules.append(Wrapper(ERROR_MESSAGES, modules, P_SCHEDULE))
modules.append(Helper())

# Run the 'clear' command once, show the welcome text, then enter the prompt loop.
user_input('clear', [])
welcome()
while True:
    user_inp = input('>> ')

    # Empty input is ignored and the prompt is shown again.
    if user_inp:
        # The first space-separated token is the command, the rest its arguments.
        inp = user_inp.split(' ')

        command = inp[0]
        args = inp[1:]

        user_input(command, args)
コード例 #18
0
ファイル: parser.py プロジェクト: waghcwb/weg-crawler
def get_content( news, content ):
    """Clean up the HTML body of a news item and return it as a string.

    The markup in ``content[0]`` is parsed and rewritten: a fixed set of CSS
    classes is removed, ``.center`` becomes ``text-center``, paragraphs with an
    empty or non-breaking-space child are dropped, tables are wrapped in a
    ``table-responsive`` div and logged to ``logs/weg/tables.list``, image
    links and ``<img>`` sources are queued with ``set_image`` and rewritten
    with ``set_image_link``, ``.coluna6`` columns become galleries and the
    first child of every direct list item is wrapped in a ``<span>``.

    Args:
        news: dict of the news entry; ``link`` and ``id`` are read here,
            and ``news`` is passed on to ``set_image`` / ``set_image_link``.
        content: sequence whose first element is the HTML string.

    Returns:
        str: the rewritten HTML, or ``''`` when there is nothing to parse.
    """
    # Also covers an empty sequence, which used to raise IndexError.
    if not content or not content[0]:
        return ''

    allowed_images_extension = ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tif']
    document = BeautifulSoup( content[0].encode('utf-8'), 'html.parser' )
    to_remove = ['comparison', 'bgdark', 'bglight', 'default', 'clr', 'novaJanela']
    link = news['link']
    nid = news['id']

    # Strip the unwanted classes from every element that carries them.
    for item in to_remove:
        for element in document.select('.{selector}'.format(selector=item)):
            index = element['class'].index( item )
            del element['class'][ index ]

    for center in document.select('.center'):
        center['class'] = 'text-center'

    # Drop a paragraph once if any direct child is empty or a lone nbsp.
    for paragraph in document.select('p'):
        if any( child == '\xa0' or not child for child in paragraph.contents ):
            paragraph.decompose()

    tables = document.select('table')
    if tables:
        tablefilename = 'logs/weg/tables.list'
        link = link if isinstance( link, str ) else link.attrs['href']
        table_log = '[ {nid} ]: {link}\n'.format(link=link, nid=nid)
        table_attributes = ['cellpadding', 'border', 'cellspacing', 'width', 'height']

        for table in tables:
            responsive = document.new_tag('div')
            responsive['class'] = 'table-responsive'
            table.wrap( responsive )

            # A table without a class attribute used to raise KeyError here.
            classes = list( table.get('class') or [] )
            table['class'] = classes + ['table', 'table-bordered', 'table-hover']

            for attribute in table_attributes:
                del table[ attribute ]

        if os.path.isfile( tablefilename ):
            logged_tables = helper.read_file( tablefilename )

            if link not in logged_tables:
                helper.create_file(tablefilename, table_log)
            else:
                log.warning('Tabela já adicionada para a lista [ {url} ]'.format(url=link))
        else:
            helper.create_file(tablefilename, table_log)
            log.success('Log de tabelas criado.')

    for index, anchor in enumerate( document.select('a') ):
        if 'href' not in anchor.attrs:
            continue

        # The extension is taken from the original href, before any rewrite.
        file_extension = os.path.splitext( anchor.attrs['href'] )[1]

        if anchor.attrs['href'] == 'javascript:void();':
            anchor.attrs['href'] = '#{nid}'.format(nid=news['id'])
            anchor.attrs['data-prevent-default'] = 'true'

        if file_extension in allowed_images_extension:
            set_image( news, index, anchor.attrs['href'] )
            anchor.attrs['href'] = set_image_link( news, index, anchor.attrs['href'] )

    for index, image in enumerate( document.select('img') ):
        # An <img> without src used to raise KeyError and abort the whole parse.
        source = image.attrs.get('src')
        if not source:
            continue

        file_extension = os.path.splitext( source )[1]

        if file_extension in allowed_images_extension:
            set_image( news, index, source )
            image.attrs['src'] = set_image_link( news, index, source )

        # NOTE: adding an 'img-responsive' class to images outside '.coluna6'
        # was disabled (commented out) in the original code and is left out.

    for column in document.select('.coluna6'):
        column['class'] = 'xtt-gallery pull-right'

    for ul in document.select('ul'):
        ul['class'] = 'xtt-list-style'

        # Direct <li> children only; find_all(recursive=False) avoids the
        # leading-combinator selector '> li', which newer selector engines
        # reject.
        for li in ul.find_all('li', recursive=False):
            if not li.contents:
                continue

            span = document.new_tag('span')
            span.string = li.contents[0]
            li.string = ''
            li.append( span )

    return str( document ).strip()
コード例 #19
0
 def __init__(self, xmas_file):
     """Load the XMAS data file with ``cast_int=True`` into ``self.xmas_data``."""
     self.xmas_data = Helper.load_file(xmas_file, cast_int=True)
コード例 #20
0
 def __init__(self, password_file):
     """Load the password file into ``self.password_list``."""
     loaded_entries = Helper.load_file(password_file)
     self.password_list = loaded_entries
コード例 #21
0
 def __init__(self, hashtags_list, root_path = './'):
     """Create the spell checker, helper and slang table, and normalise the hashtags."""
     self.spell = SpellChecker()
     self.helper = Helper(root_path)
     self.slangs = self.helper.slang_hashmap()

     # Hashtags are kept trimmed, lower-cased and without the '#' sign.
     normalised = []
     for tag in hashtags_list:
         normalised.append(tag.strip().lower().replace('#', ''))
     self.hashtags = normalised
コード例 #22
0
class Preprocessor:
    """Normalise raw tweet text before further analysis.

    The pipeline removes URLs, user names, emoji and non-ASCII characters,
    stop words, special characters and standalone numbers; it also expands
    known slang abbreviations and spell-corrects plain lowercase words.
    """

    # Emoji and pictograph ranges removed from the text.
    _EMOJI_PATTERN = re.compile("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+", flags=re.UNICODE)

    def __init__(self, hashtags_list, root_path = './'):
        """Create the spell checker, helper and slang table, and normalise the hashtags.

        Args:
            hashtags_list: iterable of hashtag strings, with or without '#'.
            root_path: passed to ``Helper``.
        """
        self.spell = SpellChecker()
        self.helper = Helper(root_path)
        self.slangs = self.helper.slang_hashmap()
        self.hashtags = [ht.strip().lower().replace('#', '') for ht in hashtags_list]

    def preprocess_tweet(self, input):
        """Run the full clean-up pipeline on one tweet.

        Args:
            input: the raw tweet text.

        Returns:
            tuple: ``(cleaned lowercase text, ignore)`` where ``ignore`` is
            True when the cleaned text has fewer words than the configured
            ``global.words_threshold``.
        """
        self.tweet = input
        self.__remove_urls()
        self.__remove_usernames()
        self.__remove_non_latin()
        self.__remove_stopwords()
        self.__prune_slang_dictation()
        # Slang expansion can introduce new stop words, so filter again.
        self.__remove_stopwords()
        self.__remove_special_chars()
        self.__final_prunning()

        return self.tweet.lower(), self.__ignore_tweet()

    def __remove_usernames(self):
        """Remove ``@user`` mentions."""
        # NOTE(review): the leading lookahead always succeeds because '@' is
        # itself a non-word character; a lookbehind may have been intended.
        # Left unchanged because the intended behavior is not clear from here.
        self.tweet = re.sub(r"(?=[^\w])\@\w+(?=[^\w]|$)", r"", self.tweet)

    def __remove_non_latin(self):
        """Remove emoji and non-ASCII characters and decode ``&amp;``."""
        self.tweet = self._EMOJI_PATTERN.sub(r"", self.tweet)
        self.tweet = self.tweet.encode('ascii', 'ignore').decode('ascii')
        # Decode the HTML entity; the former replace("&", "&") was a no-op.
        self.tweet = self.tweet.replace("&amp;", "&")

    def __remove_special_chars(self):
        """Flatten line breaks and drop everything but word characters and whitespace."""
        self.tweet = self.tweet.replace('\n', ' ').replace('\r', '')
        self.tweet = re.sub(r"[^\w\s]", r"", self.tweet)

    def __remove_urls(self):
        """Remove ``https://t.co/...`` short links."""
        # Dots are escaped so only the literal host "t.co" matches.
        self.tweet = re.sub(r"https://t\.co/\w*", r"", self.tweet)

    def __remove_stopwords(self):
        """Drop the tokens spaCy flags as stop words."""
        # Loading the model is expensive and this method runs twice per tweet,
        # so the pipeline is loaded once and cached on the instance.
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            nlp = self._nlp = spacy.load("en_core_web_sm")
        self.tweet = " ".join([token.text for token in nlp(self.tweet) if not token.is_stop])

    def __prune_slang_dictation(self):
        """Expand slang abbreviations and spell-correct eligible words."""
        new_words = []

        for word in word_tokenize(self.tweet):
            if self.__should_be_chacked_for_slang(word) and word.upper() in self.slangs:
                new_words.append(self.slangs[word.upper()])
            elif self.__should_be_chacked_for_correction(word):
                correct_word = self.spell.correction(word)

                # correction() may yield None when it has no candidate; keep
                # the original word in that case.
                if correct_word is None or word == correct_word:
                    new_words.append(word)
                elif correct_word.upper() in self.slangs:
                    new_words.append(self.slangs[correct_word.upper()])
                else:
                    new_words.append(correct_word)
            else:
                new_words.append(word)

        self.tweet = " ".join(new_words)

    def __ignore_tweet(self):
        """Return True when the tweet has fewer words than ``global.words_threshold``."""
        words_threshold = self.helper.config_item('global.words_threshold')
        words = word_tokenize(re.sub(r"[^\w\s]", r"", self.tweet))

        return len(words) < words_threshold

    def __should_be_chacked_for_slang(self, word):
        """Return True when ``word`` may be looked up in the slang table."""
        exceptions = self.helper.config_item('global.abbr_exceptions')
        exceptions = [ex.strip().lower() for ex in exceptions.split(',')]

        # Too short to be an abbreviation (also protects word[0] below).
        if len(word) < 2:
            return False
        if word.lower() in exceptions:
            return False
        if word.lower() in self.hashtags:
            return False
        # Possibly a name: Capitalised word. isupper must be *called*; the
        # bare method object was always truthy, which excluded every word with
        # a lowercase tail.
        if word[0].isupper() and word[1:].islower():
            return False

        return True

    def __should_be_chacked_for_correction(self, word):
        """Return True when ``word`` should be passed to the spell checker."""
        exceptions = self.helper.config_item('global.correction_exceptions')
        exceptions = [ex.strip().lower() for ex in exceptions.split(',')]
        uppercase_chars = [ch for ch in word if ch.isupper()]

        # Ignore words that contain anything other than A to Z characters.
        # The '+' is required: without it only one-letter words matched.
        if not re.match(r"^[A-Za-z]+$", word):
            return False
        if word.lower() in exceptions:
            return False
        if word.lower() in self.hashtags:
            return False
        # Possibly a name
        if word[0].isupper() and word[1:].islower():
            return False
        # Ignore word if it has more than 1 uppercase letter
        if len(uppercase_chars) > 1:
            return False

        return True

    def __final_prunning(self):
        """Remove standalone numbers and collapse whitespace."""
        self.tweet = re.sub(r"\b[0-9]+\b", r"", self.tweet)
        self.tweet = re.sub(r"\s+", r" ", self.tweet.strip().lower())
コード例 #23
0
class DBI:
    """Thin wrapper around a MongoDB database configured via ``mongodb_config``."""

    # Outcome codes returned by upsert().
    MODE_NOT_CHANGED = 0
    MODE_UPDATED = 1
    MODE_INSERTED = 2

    def __init__(self, root_path='./'):
        """Connect to MongoDB and select the configured database.

        A non-empty string ``dsn`` setting is used as the connection string;
        otherwise host, port, username and password are read individually.

        Args:
            root_path: base directory handed to ``Helper``; normalised to end
                with exactly one '/'.
        """
        self.root_path = root_path.rstrip('/') + '/'
        self.helper = Helper(self.root_path)
        self.conf_name = 'mongodb_config'

        dsn = self.helper.config_item('{}.dsn'.format(self.conf_name))

        if isinstance(dsn, str) and len(dsn) > 0:
            dbh = MongoClient(dsn)
        else:
            dbh = MongoClient(
                host=self.helper.config_item('{}.host'.format(self.conf_name)),
                port=self.helper.config_item('{}.port'.format(self.conf_name)),
                username=self.helper.config_item('{}.username'.format(
                    self.conf_name)),
                password=self.helper.config_item('{}.password'.format(
                    self.conf_name)))

        self.db = dbh[self.helper.config_item('{}.db_name'.format(
            self.conf_name))]

    def insert(self, collection, document):
        """Insert one document (dict) or several documents (list).

        Args:
            collection: name of the collection.
            document: a dict, or a list of dicts.

        Returns:
            tuple: ``(status, inserted_ids)``. ``inserted_ids`` is a single id
            for a dict, a list of ids for a list, and ``None`` (with status
            False) for any other input type or an unacknowledged write.
        """
        collection = self.db[collection]

        if type(document) is dict:
            result = collection.insert_one(document)
            many = False
        elif type(document) is list:
            result = collection.insert_many(document)
            many = True
        else:
            result = None

        if result is not None and result.acknowledged is True:
            inserted_ids = result.inserted_ids if many else result.inserted_id
        else:
            inserted_ids = None

        return inserted_ids is not None, inserted_ids

    def upsert(self, collection, filter, document):
        """Replace the document matching ``filter`` or insert it when absent.

        On a duplicate-key error the document's ``_id`` is removed (the passed
        dict is modified) and the operation is retried once without it.

        Args:
            collection: name of the collection.
            filter: query selecting the document to replace.
            document: the replacement document.

        Returns:
            tuple: ``(status, mode, instance_id)``. ``mode`` is one of the
            ``MODE_*`` constants, or None on failure. ``instance_id`` is the
            ``upserted_id`` of the result, which is only populated for inserts.
        """
        handle = self.db[collection]
        status = False
        mode = None
        instance_id = None

        try:
            result = handle.replace_one(filter, document, upsert=True)

            if result.acknowledged:
                status = True

                if result.matched_count > 0:
                    mode = self.MODE_NOT_CHANGED if result.modified_count == 0 else self.MODE_UPDATED
                else:
                    mode = self.MODE_INSERTED

                instance_id = result.upserted_id
        except DuplicateKeyError:
            # Retry without the conflicting _id. The retry must pass the
            # collection *name* again (the original call dropped it and raised
            # TypeError). Without an _id left to remove there is nothing to
            # retry with, so the write is reported as failed instead of
            # recursing forever.
            if '_id' in document:
                del document['_id']
                return self.upsert(collection, filter, document)
        except WriteError:
            # Best effort: a failed write is reported through status False.
            pass

        return status, mode, instance_id

    def row_exists(self, collection, filter):
        """Return True when at least one document matches ``filter``."""
        return self.db[collection].count_documents(filter) > 0

    def find_all(self, collection, where, custom_fields=None):
        """Return a cursor over the documents matching ``where``.

        ``custom_fields`` is passed as the second argument of ``find`` only
        when it is a dict.
        """
        handle = self.db[collection]

        if isinstance(custom_fields, dict):
            return handle.find(where, custom_fields)
        return handle.find(where)

    def update_one(self, collection, query, update):
        """Apply ``update`` to the first document matching ``query``."""
        return self.db[collection].update_one(query, update)
コード例 #24
0
ファイル: day1.py プロジェクト: bstascavage/advent-of-code
 def __init__(self, expense_file):
     """Load the expense file with ``cast_int=True`` into ``self.expense_list``."""
     loaded_expenses = Helper.load_file(expense_file, cast_int=True)
     self.expense_list = loaded_expenses
コード例 #25
0
 def __init__(self, adapter_file):
     """Load the adapter values, add a 0 entry and sort them ascending."""
     adapters = Helper.load_file(adapter_file, cast_int=True)
     # The extra 0 is appended to the loaded list before sorting in place.
     adapters.append(0)
     adapters.sort()
     self.adapter_list = adapters