Example #1
def create_index(index):
    status = check_index(index)

    if not status:
        create_index_util(index)
        log("Index Created in main")
    get_index_mapping(index)
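check_index and create_index_util are defined elsewhere in this crawler; check_index presumably just asks Elasticsearch whether the index already exists. A minimal sketch of what it likely does (an assumption, not the project's actual code), reusing the module-level es client shown in the later examples:

def check_index(index):
    # Hypothetical helper: True if the index already exists in Elasticsearch.
    return es.indices.exists(index=index)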
Example #2
    def final_optimised_model(self, model, _train, y, epochs=None, batch_size=None):
        try:
            # fit the keras model on the dataset
            model.fit(_train, y, epochs=epochs, batch_size=batch_size)

            # evaluate the keras model
            _, accuracy = model.evaluate(_train, y)
            print('Accuracy: %.2f' % (accuracy*100))

            # make class predictions with the model
            # predictions_2 = model.predict_classes(_train)
            time.sleep(10)
            try:
                # serialize model to JSON
                model_json = model.to_json()
                with open("model.json", "w") as json_file:
                    json_file.write(model_json)
                # serialize weights to HDF5
                model.save_weights("model.h5")
                time.sleep(100)
            except Exception as e:
                log('----------Error in Model.json ----------:{}'.format(e), 'error')
                raise e

            print("End of the model running")
        except Exception as e:
            log('----------Error in final_optimised_model ----------:{}'.format(e), 'error')
            raise e
        return
Example #3
    def data_processing(self, data):

        try:
            data[self.textcolumn] = data[self.textcolumn].apply(self.text_preprocess)
            # data['content'] = self.text_preprocess(data['content'])

            data = data[data[self.engagement] > 0]

            data['engagement_bucket'] = pd.qcut(data[self.engagement], q=[0,0.5, 0.75, 1], labels=['Low', 'Medium', 'High'])

            # Creating time related features such as time, day, etc.
            data['day'] = data[self.date].dt.day
            data['hour'] = data[self.date].dt.hour
            data['week_day'] = data[self.date].dt.weekday

            # hour = data.groupby('hour')[self.engagement].mean()
            # weekday = data.groupby('week_day')[self.engagement].mean()
            # dayofmonth = data.groupby('day')[self.engagement].mean()

            X = data[['word_count', 'hour', 'week_day']]
            X = pd.get_dummies(X, drop_first=True)

            X[self.textcolumn] = data[self.textcolumn]
            X.reset_index(drop=True,inplace=True)

            y = data['engagement_bucket']
            # y = pd.get_dummies(y)
        except Exception as e:
            log('----------Error in Data Processing ----------:{}'.format(e), 'error')
            raise e
        return X, y
Example #4
 def BAD_SYMBOLS_RE(self):
     try:
         BAD_SYMBOLS_RE = re.compile("[^0-9a-z #+_]")
     except Exception as e:
         log('----------Error in BAD_SYMBOLS_RE function ----------:{}'.format(e), 'error')
         raise e
     return BAD_SYMBOLS_RE
Example #5
 def balancing(self, _train, y):
     try:
         smote = SMOTE(sampling_strategy='minority')
         _train, y = smote.fit_resample(_train, y)  # fit_resample replaces the removed fit_sample API
     except Exception as e:
         log('----------Error in Smote ----------:{}'.format(e), 'error')
         raise e
     return _train, y
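A quick way to sanity-check this balancing step in isolation is to run SMOTE on a small synthetic dataset. Everything below is hypothetical demo data, not from the project:

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Toy imbalanced dataset (roughly a 90/10 class split) just to illustrate the call
X_demo, y_demo = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
print(Counter(y_demo))    # e.g. Counter({0: 179, 1: 21})
X_res, y_res = SMOTE(sampling_strategy='minority').fit_resample(X_demo, y_demo)
print(Counter(y_res))     # the minority class is oversampled to match the majority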
Example #6
    def REPLACE_BY_SPACE_RE(self):
        try:
            REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;!]")

        except Exception as e:
            log('----------Error in REPLACE_BY_SPACE_RE function ----------:{}'.format(e), 'error')
            raise e
        return REPLACE_BY_SPACE_RE
Example #7
 def optimized_model(self):
     try:
         model = self.model()
         loaded_model = self.final_optimised_model(model, self._train, self.label,self.best_params['epochs'],
                                                   self.best_params['batch_size'])
     except Exception as e:
         log('----------Error in Optimized Model ----------:{}'.format(e), 'error')
         raise e
     return loaded_model
Example #8
    def text_preprocess(self, text):
        """
        text: a string

        return: modified initial string
        """
        try:
            negation = ["no", "nor", "not", "don", "don't", "aren", "aren't", "couldn", "couldn't", "didn", "didn't",
                        "doesn", "doesn't",
                        "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "mightn", "mightn't",
                        "mustn", "mustn't",
                        "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
                        'won', "won't", 'wouldn', "wouldn't"]
            stop = set(stopwords.words('english')) - set(negation)

            # Custom stopwords
            stoplist = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                        "you'd", 'your',
                        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
                        'herself', 'it',
                        "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
                        'whom', 'this', 'that', "that'll",
                        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                        'having', 'do', 'does', 'did',
                        'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
                        'by', 'for', 'with', 'about',
                        'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                        'from', 'up', 'down', 'in', 'out',
                        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
                        'why', 'all', 'any',
                        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
                        'too',
                        'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
                        've', 'y', 'rt', 'rt', 'qt', 'for',
                        'the', 'with', 'in', 'of', 'and', 'its', 'it', 'this', 'i', 'have', 'has', 'would', 'could', 'you',
                        'a', 'an',
                        'be', 'am', 'can', 'edushopper', 'will', 'to', 'on', 'is', 'by', 'ive', 'im', 'your', 'we', 'are',
                        'at', 'as', 'any', 'ebay', 'thank', 'hello', 'know',
                        'need', 'want', 'look', 'hi', 'sorry', 'http', 'body', 'dear', 'hello', 'hi', 'thanks', 'sir',
                        'tomorrow', 'sent', 'send', 'see', 'there', 'welcome', 'what', 'well', 'us']

            stop.update(set(stoplist))
            REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;!]")
            BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")
            text = re.sub(r'\d', '', str(text))  # removing digits
            text = re.sub(r"(?:\@|https?\://)\S+", "", str(text))  # removing mentions and urls
            text = text.lower()  # lowercase text
            text = re.sub('[0-9]+', '', text)
            text = REPLACE_BY_SPACE_RE.sub(" ", text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
            text = BAD_SYMBOLS_RE.sub(" ", text)  # delete symbols which are in BAD_SYMBOLS_RE from text
            text = ' '.join([word for word in text.split() if word not in stop])  # delete stopwords from text
            text = text.strip()
        except Exception as e:
            log('----------Error in Text Processing ----------:{}'.format(e), 'error')
            raise e
        return text
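As a rough standalone illustration of what the regex part of this cleaning does (demo string only, stopword removal omitted):

import re

REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;!]")
BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")

sample = "Thanks @support! See https://example.com/help; order #123 arrived :)"
sample = re.sub(r"(?:\@|https?\://)\S+", "", sample)   # drop mentions and urls
sample = sample.lower()
sample = re.sub('[0-9]+', '', sample)                  # drop digits
sample = REPLACE_BY_SPACE_RE.sub(" ", sample)
sample = BAD_SYMBOLS_RE.sub(" ", sample)
print(' '.join(sample.split()))                        # -> "thanks see order # arrived"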
Example #9
 def tryUse(self):
     try:
         log('try')
         r = 10 / 0
         log(f'result = {r}')
     except ZeroDivisionError as e:
         log(f'except =  {e}')
     finally:
         log('finally')
     log('end')
Example #10
    def TfidfVectorizer(self, X, y):
        try:
            vec = TfidfVectorizer(strip_accents='unicode', ngram_range=(1,2), max_features=3000, smooth_idf=True, sublinear_tf=True)
            train_vec = vec.fit_transform(X[self.textcolumn])

            _train = np.hstack([X.drop(self.textcolumn, axis=1), train_vec.toarray()])
            y = LabelEncoder().fit_transform(y)
            scaler = Normalizer().fit(_train)
            _train = scaler.transform(_train)
        except Exception as e:
            log('----------Error in TfidfVectorizer ----------:{}'.format(e), 'error')
            raise e
        return _train, y
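The max_features=3000 cap here, together with the three numeric columns kept in X (word_count, hour, week_day), is presumably where the hard-coded input_dim=3003 in the Keras model of Example #14 comes from.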
Example #11
 def result(self):
     try:
         data = self.read_data()
         X,y = self.data_processing(data)
         _train, y = self.TfidfVectorizer(X,y)
         _train, y = self.balancing(_train, y)
         self._train = _train
         self.label = y
         best_score, best_params = self.updating_hyperameters(self.model, _train, y)
         self.best_params = best_params
     except Exception as e:
         log('----------Error in Result ----------:{}'.format(e), 'error')
         raise e
     return self.optimized_model()
Example #12
def read_file():
    f = None
    try:
        f = open('C:/Users/agoto/Desktop/log.txt')
        log(f.read())
    finally:
        if f:
            f.close()
    # Python provides the with statement to call close() for us automatically
    # The mode 'r' means read text; for binary files such as images or videos, open with 'rb'
    with open('C:/Users/agoto/Desktop/log.txt', 'r') as f:
        print(f.read(100))
        for line in f.readlines():  # continues from where read(100) stopped
            print(line.strip())
    # The codecs module converts the file encoding while reading, yielding unicode directly
    with codecs.open('C:/Users/agoto/Desktop/test.log', 'r', 'gbk') as f:
        print(f.read())
Example #13
    def read_data(self):
        try:
            data = pd.read_excel(self.dataset, parse_dates=[self.date])
            data[self.date] = pd.to_datetime(data[self.date], errors='coerce')
            # data[self.date] = data[self.date].dt.strftime('%Y/%m/%d')
            data[self.engagement] = data[self.engagement].astype(int)
            data[self.wordcount] = data[self.wordcount].astype(int)
            # source['Tweet_type'] = source['Tweet_type'].astype('category')
            print(data[[self.date, self.textcolumn, self.wordcount, self.engagement]])
            source = data[[self.date, self.textcolumn, self.wordcount, self.engagement]]


        except Exception as e:
            log('----------Error in Read data ----------:{}'.format(e), 'error')
            raise e
        return source
Example #14
    def model(self, kernel_initializer='glorot_uniform',  activation = 'relu', dropout_rate=0.5, weight_constraint=0):
        try:
            # define the keras model
            model = Sequential()
            model.add(Dense(300, input_dim=3003, activation=activation, kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(200, activation=activation, kernel_initializer=kernel_initializer,kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(100, activation=activation, kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(3, activation='softmax', kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))

            # compile the keras model
            # optimizer = SGD(lr=learn_rate, momentum=momentum)
            model.compile(loss='sparse_categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
        except Exception as e:
            log('----------Error in model function ----------:{}'.format(e), 'error')
            raise e
        return model
Example #15
 def updating_hyperameters(self, create_model=None, X=None, Y=None):
     try:
         model = KerasClassifier(build_fn=create_model, verbose=1)
         # define the grid search parameters
         batch_size = [10, 20, 40, 60, 80, 100]
         epochs = [10, 50, 100]
         optimizer = ['SGD', 'Adam']
         learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
         momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
         # init_mode = ['uniform', 'glorot_uniform',  'glorot_normal', 'normal', 'zero']
         # activation = ['softmax', 'relu', 'tanh', 'sigmoid',  'linear']
         weight_constraint = [1, 2, 3, 4, 5]
         # dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
         param_grid = dict(batch_size=batch_size, epochs=epochs)
         grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
         grid_result = grid.fit(X, Y)
         best_score = grid_result.best_score_
         best_params = grid_result.best_params_
     except Exception as e:
         log('----------Error in updating Parameters ----------:{}'.format(e), 'error')
         raise e
     return best_score, best_params
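Note that only batch_size and epochs actually end up in param_grid; the optimizer, learn_rate, momentum, and weight_constraint lists are built but never searched. The legacy KerasClassifier wrapper forwards grid keys that match keyword arguments of the build function, so extra hyperparameters can be searched once the build function accepts them. A self-contained toy sketch of that mechanism (hypothetical build_model and demo data, assuming the same keras.wrappers.scikit_learn.KerasClassifier as above):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def build_model(dropout_rate=0.5):
    # Hypothetical build function with one tunable keyword argument
    m = Sequential()
    m.add(Dense(8, input_dim=4, activation='relu'))
    m.add(Dropout(dropout_rate))
    m.add(Dense(1, activation='sigmoid'))
    m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return m

X_demo = np.random.rand(60, 4)
Y_demo = np.random.randint(0, 2, 60)

# dropout_rate is not a fit() parameter, so the wrapper routes it to build_model
param_grid = dict(batch_size=[10, 20], epochs=[5], dropout_rate=[0.3, 0.5])
grid = GridSearchCV(estimator=KerasClassifier(build_fn=build_model, verbose=0),
                    param_grid=param_grid, cv=3)
print(grid.fit(X_demo, Y_demo).best_params_)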
Example #16
def home():
    log("HOME")
    # Pip package utils
    num = add(1, 2)

    # DB
    log("GETTING FROM DB")
    output = "dummy_db_output"
    try:
        conn = db.connection
        cur = conn.cursor()
        cur.execute("select * from test limit 1;")
        output = cur.fetchall()
    except Exception as e:
        log("ERROR DB")
        log(e)
        pass

    # Proto
    log("CALLING MICROSERVICE")
    channel = grpc.insecure_channel('{}:50051'.format(GRPC_HOST))
    stub = microservice_pb2_grpc.BlogServiceStub(channel)
    response = stub.GetBlog(microservice_pb2.GetBlogRequest())

    # Kafka -- not working on Kubernetes, comment out for simplicity
    # log("SENDING TO KAFKA")
    # future = producer.send(TOPIC, {"test": "hello"}).add_callback(on_send_success).add_errback(on_send_error)
    # result = future.get(timeout=5)
    # log("Result = {}".format(result))

    return jsonify({
        "db": str(output),
        # "topic": result.topic,
        "lib": num,
        "microservice": str(response.blog)
    })
Example #17
def get_all_url():
    full_list = []
    state_names = os.getenv('state')
    state_names = state_names.split(' ')

    for state in state_names:
        base_urls = get_state_url(state)
        pdf_urls = []

        rec_urls = []
        for start_url in base_urls:
            if "pdf" in start_url['url'].lower():
                pdf_urls.append(start_url)
            elif "corona" in start_url['url'].lower(
            ) or "covid" in start_url['url'].lower():
                if "cdc" in start_url['url'].lower():
                    full_list.append(start_url)
                else:
                    rec_urls.append(start_url)
            else:
                full_list.append(start_url)

        # log(len(rec_urls))
        for rec_url in rec_urls:
            urls = get_all_url_utils(rec_url)
            for url in urls:
                list_urls = get_all_url_utils(url)
                try:
                    for i in list_urls:
                        if i not in full_list:
                            # log("adding to list: " + i['url'])
                            full_list.append(i)
                except TypeError as e:
                    log('Whoops, wrong content passed: {}'.format(e))
            # log(rec_url + " " + str(len(full_list)))
        for pdf_url in pdf_urls:
            full_list.append(pdf_url)
        log("Pdf count " + str(len(pdf_urls)))
        log(state + " " + str(len(full_list)))
    return full_list
Example #18
def delete_index(index):
    es.indices.delete(index=index, ignore=[400, 404])
    log("Deleted")
Example #19
def get_all_url_utils(url):
    # log("Requested url is: " + url['url'])
    home_page_url = url['url'].split('/')
    if len(home_page_url) > 2:  # need scheme and host to rebuild the base, e.g. ['http:', '', 'host', ...]
        home_page_url = home_page_url[0] + '//' + home_page_url[2]
    url_list = []
    try:
        resp = urllib.request.urlopen(url['url'])
        soup = BeautifulSoup(resp,
                             from_encoding=resp.info().get_param('charset'),
                             features="html.parser")
        for link in soup.find_all('a', href=True):
            if "corona" in link['href'].lower(
            ) or "covid" in link['href'].lower():
                if ignore_urls(link):
                    continue
                url_dict = {}
                if link['href'].startswith('http'):
                    url_dict = {
                        'url': link['href'],
                        'county': url['county'],
                        'contentType': url['contentType'],
                        'channel': url['channel']
                    }
                else:
                    url_dict = {
                        'url': home_page_url + link['href'],
                        'county': url['county'],
                        'contentType': url['contentType'],
                        'channel': url['channel']
                    }
                url_list.append(url_dict)
    except urllib.error.HTTPError:
        log("urllib error http error for " + url['url'])
        pass
    except urllib.error.URLError:
        log("url error for " + url['url'])
        pass
    except ValueError:
        log("value error for " + url['url'])
        pass
    except ConnectionResetError:
        log("connection reset error " + url['url'])
        pass
    except http.client.InvalidURL:
        log("http client invalid url for " + url['url'])
        pass
    except UnboundLocalError:
        log("unbound local error for " + url['url'])
        pass
    except http.client.IncompleteRead:
        log("http client incomplete read " + url['url'])
        pass
    except TypeError:
        log("Type error for " + url['url'])
        pass
    return url_list
Example #20
# import states

# For server
es = Elasticsearch([os.getenv('elastic_server_host')],
                   http_auth=(os.getenv('elastic_username'),
                              os.getenv('elastic_password')),
                   scheme="https",
                   port=os.getenv('elastic_port'),
                   verify_certs=False)

# For Azure Cloud
# es = Elasticsearch(os.getenv('elastic_azure_host') + os.getenv('elastic_azure_port'),
#                    http_auth=(os.getenv('elastic_azure_username'), os.getenv('elastic_azure_password')))

log(es.info())


Example #21
def on_send_success(metadata):
    log("SUCCESSFULLY SENT TO KAFKA")
Example #22
def get_index_mapping(index):
    log(es.indices.get_mapping(index=index))
Example #23
def create_index_util(index):
    request_body = {
        "settings": {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "tokenizer": "standard",
                            "filter": ["lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type": "shingle",
                            "max_shingle_size": 4,
                            "min_shingle_size": 2,
                            "output_unigrams": "true"
                        }
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "channel": {
                    "type": "keyword"
                },
                "contentType": {
                    "type": "keyword"
                },
                "contentdiff": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "county": {
                    "type": "keyword"
                },
                "createdatetime": {
                    "type": "text"
                },
                "htmldiff": {
                    "type": "text"
                },
                "state": {
                    "type": "keyword"
                },
                "url": {
                    "type": "text"
                },
                "title": {
                    "type": "keyword"
                },
                "previewText": {
                    "type": "nested",
                    "properties": {
                        "title": {
                            "type": "text"
                        },
                        "description": {
                            "type": "text"
                        },
                        "image": {
                            "type": "text"
                        },
                        "website": {
                            "type": "text"
                        }
                    }
                }
            }
        },
    }

    log("Creating Index")
    es.indices.create(index=index, body=request_body)
    log("Index Created")
Example #24
            urls = get_all_url_utils(rec_url)
            for url in urls:
                list_urls = get_all_url_utils(url)
                try:
                    for i in list_urls:
                        if i not in full_list:
                            # log("adding to list: " + i['url'])
                            full_list.append(i)
                except TypeError as e:
                    log('Whoops, wrong content passed: {}'.format(e))
            # log(rec_url + " " + str(len(full_list)))
        for pdf_url in pdf_urls:
            full_list.append(pdf_url)
        log("Pdf count " + str(len(pdf_urls)))
        log(state + " " + str(len(full_list)))
    return full_list


if __name__ == "__main__":
    urls = get_all_url()
    # url = {
    #     "url": "www.coronavirus.kdheks.gov",
    #     "county": "hennepin",
    #     "contentType": "provider",
    #     "channel": "url"
    # }
    # urls = get_all_url_utils(url)
    for url in urls:
        log(url)
Example #25
def find_change(content):
    log("job started")
    # urls = [{"url":"https://www.denvergov.org/content/dam/denvergov/Portals/771/documents/covid-19/FaceCoveringRequired_site_11x17.pdf"}]
    urls = get_all_url()
    log("total urls to be crawled " + str(len(urls)))

    html_diff = difffile.HtmlDiff(tabsize=4, wrapcolumn=80)

    connection_error_urls = []
    could_not_retrive_urls = []
    stop_iteration_urls = []
    new_urls = []

    for url in urls:
        pdf_content = False
        # log("going to " + url['url'])

        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'
        try:
            requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
        except AttributeError:
            # no pyopenssl support used / needed / available
            pass

        try:
            req = requests.get(url['url'])
        except requests.exceptions.ConnectionError:
            connection_error_urls.append(url)
            continue
        except requests.exceptions.TooManyRedirects:
            log("Too many redirects for: " + url['url'])
            continue
        except requests.exceptions.ChunkedEncodingError:
            log("Chunked encoding error " + url['url'])
            continue
        except requests.exceptions.InvalidURL:
            log("requests exception invalid url" + url['url'])
            continue
        except requests.exceptions.MissingSchema:
            log("request exception missing schema: " + url['url'])
            continue
        except UnicodeError:
            log("unicode error for " + url['url'])
            continue
        except requests.exceptions.InvalidSchema:
            log("requests exceptions invalid schema " + url['url'])
            continue

        if req.status_code in [200]:
            try:
                if req.headers['Content-Type'].split(
                        ';')[0] == 'application/pdf':
                    pdf_content = True
                    html = read_from_url(
                        url['url'])  # still keeping the variable name as html
                    if html == "failed":
                        log("pdf ocr failed for url - " + url['url'])
                        continue
                else:
                    html = req.text
            except KeyError:
                log("Key error for " + url['url'])
                continue  # html was never set for this url; skip it

        else:
            could_not_retrive_urls.append(url)
            html = None
            continue

        if url['url'] not in content.keys():
            content[url['url']] = html
            new_urls.append(url)
        else:
            old_html = content[url['url']]
            current_html = html
            content[url['url']] = current_html

            old_html_body = old_html
            current_html_body = current_html

            if not pdf_content:
                try:
                    current_html = BeautifulSoup(
                        current_html.encode('utf-8').decode('ascii', 'ignore'),
                        "html.parser")
                except UnboundLocalError:
                    log("unbound local error for " + url['url'])
                    continue
                except TypeError:
                    log("type error for " + url['url'])
                    continue
                [
                    s.extract()
                    for s in current_html.findAll(['script', 'style'])
                ]
                try:
                    current_html_body = current_html.find('body').text
                except AttributeError:
                    log("attribute error for " + url['url'])
                    continue

                old_html = BeautifulSoup(
                    old_html.encode('utf-8').decode('ascii', 'ignore'),
                    "html.parser")
                [s.extract() for s in old_html.findAll(['script', 'style'])]
                if not old_html.find('body'):
                    continue
                old_html_body = old_html.find('body').text

            md5_1 = hashlib.md5()

            try:
                md5_1.update(old_html_body.encode('utf-8'))
            except AttributeError:
                log("Attribute error " + url['url'])
                continue
            hash_1 = md5_1.hexdigest()

            md5_2 = hashlib.md5()
            md5_2.update(current_html_body.encode('utf-8'))
            hash_2 = md5_2.hexdigest()

            if hash_1 == hash_2:
                pass
            else:
                if not pdf_content:
                    parsed_curr_html = BeautifulSoup(
                        html.encode('utf-8').decode('ascii', 'ignore'),
                        "html.parser")

                    title = ''
                    if parsed_curr_html.find('head'):
                        if parsed_curr_html.head.find('title'):
                            title = parsed_curr_html.head.find('title').text
                    try:
                        html_diffs = html_diff.make_file(
                            old_html_body.splitlines(),
                            current_html_body.splitlines(),
                            '<h2><a href= %s>%s</a></h2>' %
                            ("\"" + url['url'] + "\"", title),
                            "",
                            context=True,
                            numlines=3)
                    except StopIteration:
                        log("stop iteration for " + url['url'])
                        stop_iteration_urls.append(url)
                        continue
                    except RecursionError:
                        log("recursion error for " + url['url'])
                        continue

                else:

                    title = url['url']

                    try:
                        html_diffs = html_diff.make_file(
                            old_html.splitlines(),
                            current_html.splitlines(),
                            '<h2><a href= %s>%s</a></h2>' %
                            ("\"" + url['url'] + "\"", title),
                            "",
                            context=True,
                            numlines=3)
                    except StopIteration:
                        stop_iteration_urls.append(url)
                        continue

                diff_json = get_diff_json(html_diffs, url['url'])

                if len(diff_json) > 0:
                    title = ""
                    description = ""
                    image = ""
                    website = ""

                    if not pdf_content:
                        try:
                            dict_elem = link_preview.generate_dict(url['url'])
                        except urllib.error.HTTPError:
                            log("urllib error http error " + url['url'])
                            continue
                        except UnicodeEncodeError:
                            log("unicode encode error " + url['url'])
                            continue
                        except IndexError:
                            log("index error for " + url['url'])
                            continue
                        except http.client.InvalidURL:
                            log("http client invalid url " + url['url'])
                            continue
                        except UnicodeDecodeError:
                            log("unicode decode error " + url['url'])
                            continue
                        except urllib.error.URLError:
                            log("urllib error url error " + url['url'])
                            continue
                        except ConnectionResetError:
                            log("connection reset error " + url['url'])
                            continue
                        title = dict_elem['title']
                        description = dict_elem['description']
                        image = dict_elem['image']
                        website = dict_elem['website']

                    html_diffs += ('<br><br>')
                    html_diffs += ('page link - <a href= %s>%s</a></h2>' %
                                   ("\"" + url['url'] + "\"", url['url']))
                    html_diffs += ('<br><br>')
                    # log(html_diffs)
                    index = "state_" + os.getenv('state').lower().replace(
                        ' ', '_')
                    state = os.getenv('state')
                    page_url = url['url']
                    htmldiff = html_diffs
                    createdatetime = datetime.now()
                    county = url['county']
                    contentType = url['contentType']
                    channel = url['channel']
                    contentdiff = ""
                    preview = {
                        "title": title,
                        "description": description,
                        "image": image,
                        "website": website
                    }
                    # log(preview)

                    document = {
                        "state": state,
                        "channel": channel,
                        "contentType": contentType,
                        "contentdiff": contentdiff,
                        "county": county,
                        "createdatetime": createdatetime,
                        "htmldiff": htmldiff,
                        "url": page_url,
                        "title": title,
                        "previewText": preview
                    }
                    feed_documents(index, document)
                    for json in diff_json:
                        feed_nlp_document("nlp_data", json)
    log("job finished")
Example #26
                        "contentType": contentType,
                        "contentdiff": contentdiff,
                        "county": county,
                        "createdatetime": createdatetime,
                        "htmldiff": htmldiff,
                        "state": state,
                        "url": page_url,
                        "title": title,
                        "previewText": preview
                    }
                    feed_documents(index, document)
                    for json in diff_json:
                        feed_nlp_document("nlp_data", json)
    log("job finished")


log("crawler started")
first_run = False
find_change(content)
first_run = True

schedule.every().day.at("13:00").do(find_change, content)
schedule.every().day.at("01:00").do(find_change, content)
log("schedule started")
while True:
    schedule.run_pending()
    time.sleep(1)

# find_change(content)
# find_change(content)
Example #27
                             },
                             auth=(os.getenv('elastic_username'),
                                   os.getenv('elastic_password')),
                             verify=False).json()

    # for azure
    # response = requests.post(url,
    #                          data=payload,
    #                          headers={'Content-Type': 'application/json'},
    #                          auth=(os.getenv('elastic_azure_username'), os.getenv('elastic_azure_password')),
    #                          verify=False).json()

    # log(json.dumps(response, indent=2, sort_keys=True))
    hits = response["hits"]["hits"]
    source = hits[0]["_source"]
    state_urls = source["state_urls"]
    urls = []
    for state_url in state_urls:
        # urls.append(state_url["url"])
        if "facebook" in state_url["url"] or "twitter" in state_url["url"]:
            continue
        urls.append(state_url)
    # log(len(urls))
    return urls


if __name__ == "__main__":
    state_urls = get_state_url("pennsylvania")
    for state_url in state_urls:
        log(state_url)
Example #28
def on_send_error(excp):
    log("ERROR AFTER SENT TO KAFKA")
Example #29
def consumer_notify_success():
    log("SUCCESSFULLY CONSUMED BY CONSUMER")
    # If you trigger an emit in a regular function, it will default to a broadcast
    socketio.emit('isConsumerWorking',
                  {'data': 'triggered consumer_notify_success'})
Example #30
def search_index(index):
    results = es.search(body={"query": {"match_all": {}}}, index=index)
    for each in results['hits']['hits']:
        each = each['_source']['Url']
        log(each)