import os


def ImportDB(workspace):
    importScript = "/Users/amitrou/Documents/CodeProjects/historical_data/restore-db.sh"
    if not os.path.exists(importScript):
        raise Exception("Restore script doesn't exist: " + importScript)

    settingsPath = os.path.join(workspace, "import_settings.json")
    if not os.path.exists(settingsPath):
        raise Exception("Configuration file doesn't exist: " + settingsPath)

    import_settings = Utilities.ReadJSON(settingsPath)
    if import_settings is None:
        raise Exception("Could not read configuration file: " + settingsPath)

    server = import_settings['host']
    port = import_settings['port']
    user = import_settings['user']
    dump_list = import_settings["dump_list"]

    if len(dump_list) == 0:
        raise Exception("No dumps listed in " + settingsPath)

    for item in dump_list:
        dump_file = item["dump_file"]
        db_name = item["db_name"]
        if (dump_file is not None) and (db_name is not None):
            Utilities.ExecuteShellScript(importScript, server, port, user, db_name, dump_file)
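# A minimal sketch of the import_settings.json layout that ImportDB reads above,
# inferred from the keys it accesses; the host/port/user values and dump paths
# below are placeholders, not taken from the original project.
EXAMPLE_IMPORT_SETTINGS = {
    "host": "localhost",
    "port": 5432,
    "user": "backup",
    "dump_list": [
        {"dump_file": "dumps/trades_2019.dump", "db_name": "trades_2019"},
    ],
}

if __name__ == "__main__":
    import json
    # Write the sketch into the current directory so ImportDB(".") can pick it up.
    with open("import_settings.json", "w") as fh:
        json.dump(EXAMPLE_IMPORT_SETTINGS, fh, indent=2)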
def __heartbeat_cb(self, timer):
    self.__heartbeat_counter += 1

    if self.__heartbeat_counter >= Settings.MQTT_KEEPALIVE:
        try:
            self.__client.publish(
                b'{}/ping'.format(Settings.MQTT_USERNAME), b'ping')
            self.__heartbeat_counter = 0
        except OSError as ose:
            err_msg = str(ose)
            print("err time:", time())
            print(err_msg)

            if err_msg in ("[Errno 104] ECONNRESET", "-1"):
                try:
                    self.__client.disconnect()
                except OSError:
                    pass
                finally:
                    self.__client.connect()
            elif err_msg == "[Errno 113] EHOSTUNREACH":
                Utilities.hard_reset()

    gc.collect()
def __init__(self):
    self.data_file = app_config['data_file']
    self.texts_in_file = 'texts_in_file.txt'
    self.ner_texts_file = 'output.txt'
    self.utilities = Utilities()
    self.lemmatizer = WordNetLemmatizer()
    self.preprocessor = Preprocessor(
        ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])
def __init__(self):
    self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
    self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
    self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
    self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'
    self.ner_tagger = StanfordNERTagger(self.path_to_ner_model, self.path_to_ner_tagger)
    self.dependency_parser = StanfordDependencyParser(
        path_to_jar=self.path_to_jar,
        path_to_models_jar=self.path_to_models_jar)
    self.lemmatizer = WordNetLemmatizer()
    self.utilities = Utilities()
def getEHRClasses(patientXML, children=True, parents=True, duplicates=False):

    if ( duplicates ):

        ehrClasses = Utilities.getXMLElements(patientXML, {}, children, parents, duplicates);
        allValues = [];

        for depth in ehrClasses:
            allValues += ehrClasses[depth];

        return allValues;

    else:
        # Combines all values in dictionary of EHR depths.
        return [element.tag for element in set(set().union(*list(Utilities.getXMLElements(patientXML, {}, children, parents, duplicates).values())))];
def centroidError():
    centroid1 = Body25.getCentroid(keypoint1)
    centroid2 = Body25.getCentroid(keypoint2)
    dist = Ut.distance(centroid1 / frame, centroid2 / frame, ignore_zero_vector=True)

    if np.isnan(dist):
        return np.inf

    return dist
def getEHRClassChildren(patientXML, ehrClass, children=True, parents=False, allEHRChildren=False, contextualiseChildren=True, removeGerunds=True):

    ehrClassChildren = {};

    for ehrClassExample in patientXML.findall(".//" + ehrClass):

        ehrClassExampleDepthsToChildren = Utilities.getXMLElements(ehrClassExample, {}, children, parents, False, True, True);

        if 0 in list(ehrClassExampleDepthsToChildren.keys()):

            for element in ehrClassExampleDepthsToChildren[0]:

                # Contextualise those EHR children that do not give enough context on their own, because they are just generic children.
                if ( contextualiseChildren and element.tag.lower() in TranslationConstants.FIELDS_THAT_INDICATE_RESOURCE_CAN_HOLD_ANY_DATA ):

                    # Work out how to present this new compound child (child + parent name), based on which separators are used by this EHR.
                    if ( TranslationConstants.SEPARATOR != "" ):
                        element.tag = ehrClass + TranslationConstants.SEPARATOR + element.tag;
                    else:
                        element.tag = ehrClass[0].upper() + ehrClass[1:] + element.tag;

                ehrClassChildren.setdefault(ehrClass, []).extend([element.tag]);

                # If an EHR word begins with a gerund (such as 'Managing' in 'ManagingOrganisation'), this potentially complicates the context of the word, and so should be accounted for. Remove gerunds AND add the gerund-free version as an additional EHR child.
                if ( removeGerunds ):
                    ehrClassChildren.setdefault(ehrClass, []).extend([TranslationUtilities.removeGerund(element.tag)]);

        # As we may have multiple examples of an EHR class in an example piece of marked up data from an EHR vendor, we want to find all possible examples of children that can be listed under that class.
        if ( not allEHRChildren ):
            break;

    return ehrClassChildren;
def getFHIRClassesToChildren(fhirClasses=TranslationUtilities.getFHIRClasses(), linkedClasses=True, fhirClassesRecurse=False, selectiveRecurse=TranslationConstants.SELECTIVE_RECURSE, includesBackboneElements=True, mergeMainChildrenWithBackboneChildren=True):

    fhirClassesToChildren = {};

    if (includesBackboneElements and mergeMainChildrenWithBackboneChildren):

        for fhirClassAndBackboneElements in fhirClasses:

            fhirClass = fhirClassAndBackboneElements[0];

            for fhirClassOrBackboneElement in fhirClassAndBackboneElements:

                children = TranslationUtilities.getFHIRClassChildren(fhirClassOrBackboneElement, linkedClasses, fhirClassesRecurse, selectiveRecurse);

                if ( children != None ):
                    fhirClassesToChildren.setdefault(fhirClass, []).extend(children)

    else:

        if ( not mergeMainChildrenWithBackboneChildren ):
            fhirClasses = Utilities.mergeListOfLists(fhirClasses);

        for fhirClass in fhirClasses:

            children = TranslationUtilities.getFHIRClassChildren(fhirClass, linkedClasses, fhirClassesRecurse, selectiveRecurse);

            if ( children != None ):
                fhirClassesToChildren[fhirClass] = children;

    return fhirClassesToChildren;
def removeGerund(ehrChild):

    separatedElementTag = Utilities.listFromCapitals(ehrChild);

    if ( len(separatedElementTag) > 1 ):

        taggedSeparatedElementTag = nltk.pos_tag(separatedElementTag);
        separatedElementTag = [tag[0] for tag in taggedSeparatedElementTag if "VBG" not in tag[1]]

        return "".join(separatedElementTag);

    return ehrChild;
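# A brief usage sketch, assuming removeGerund is exposed on TranslationUtilities
# (it is called that way elsewhere in this module) and that the nltk POS tagger
# data is installed; the example word is illustrative only.
if __name__ == "__main__":
    # "Managing" is tagged VBG and dropped, leaving the gerund-free child name.
    print(TranslationUtilities.removeGerund("ManagingOrganisation"));  # expected: "Organisation"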
def morphologicalSimilarity(ehrAttribute, fhirAttribute, lemmaSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD):

    if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute):
        return 1;

    highestSimilarity = 0;

    for lemma in Utilities.lemmas(ehrAttribute):

        if SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True) > highestSimilarity and SimilarityMetrics.textMatch(lemma, fhirAttribute, True, lemmaSimilarityThreshold):
            highestSimilarity = SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True);

    return highestSimilarity;
def compositeStringSimilarity(ehrClassField, fhirClassField, comparisonMethod, comparisonMethodArgs=[], highestResult=True, removeStopwords=True):

    if ( comparisonMethod(ehrClassField, fhirClassField, *comparisonMethodArgs) == 1 ):
        return 1;

    # If the EHR class string is composite, compare each word with the FHIR target using all of the metrics, and then use the chosen combination method to produce a value, e.g. for each word, add these values, and then divide by the number of words to get an average match across all words, or return the highest.
    highestSimilarity = 0;
    highestSimilarityWord = "";
    totalSimilarity = 0;

    ehrWords = Utilities.listFromCapitals(ehrClassField);
    fhirWords = Utilities.listFromCapitals(fhirClassField);

    if (removeStopwords):
        ehrWords = [word for word in ehrWords if word.lower() not in stopwords.words('english')];

    for ehrWord in ehrWords:

        highestSimilarityForEHRWord = 0;

        for fhirWord in fhirWords:

            similarity = comparisonMethod(ehrWord, fhirWord, *comparisonMethodArgs);

            if ( similarity > highestSimilarity ):
                highestSimilarity = similarity;
                highestSimilarityWord = ehrWord;

            if ( similarity > highestSimilarityForEHRWord ):
                highestSimilarityForEHRWord = similarity;

        totalSimilarity += highestSimilarityForEHRWord;

    if ( highestResult and len(highestSimilarityWord) > TranslationConstants.LENGTH_TO_IGNORE_IN_COMPOSITE_HIGHEST ):
        return highestSimilarity;
    else:
        return old_div(totalSimilarity, max(float(len(ehrWords)), float(len(fhirWords))));
def __init__(self, params=()):
    # Each preprocessing step is enabled simply by naming it in params.
    self.remove_urls = 'remove_urls' in params
    self.remove_mentions = 'remove_mentions' in params
    self.remove_hashtags = 'remove_hashtags' in params
    self.normalize = 'normalize' in params
    self.remove_stopwords = 'remove_stopwords' in params
    self.remove_punct = 'remove_punctuation' in params
    self.lower = 'lower' in params
    self.lemmatize = 'lemmatize' in params
    self.stemming = 'stemming' in params
    self.remove_non_letters = 'remove_non_letters' in params

    self.lemmatizer = WordNetLemmatizer()
    self.stemmer = PorterStemmer()
    self.utilities = Utilities()
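# A brief usage sketch: elsewhere in this codebase the class is constructed with
# a list of option names and exposes a preprocess(text) method, so a typical
# call looks like the following; the tweet text and the exact cleaned output are
# illustrative assumptions, not taken from the original tests.
preprocessor = Preprocessor(['remove_urls', 'remove_mentions', 'lower'])
cleaned = preprocessor.preprocess("@user Check this out https://example.com GREAT news")
print(cleaned)  # roughly: "check this out great news"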
def get_evaluation_data(self, dataset_file, n_pair):
    utilities = Utilities()
    preprocessor = Preprocessor(
        ['remove_stopwords', 'remove_non_letters', 'lemmatize'])
    data_rows = utilities.read_from_csv(dataset_file)
    del data_rows[0]  # drop the CSV header row

    X = []
    y = []
    for data_row in data_rows[:n_pair]:
        # Column 2 stores a stringified (cause, effect) tuple; column 3 the label.
        candidate_causal_pair = eval(data_row[2])
        label = 1 if data_row[3] == 'causal' else 0
        candidate_causal_phrase = preprocessor.preprocess(
            candidate_causal_pair[0])
        candidate_effect_phrase = preprocessor.preprocess(
            candidate_causal_pair[1])
        if len(candidate_causal_phrase) > 0 and len(
                candidate_effect_phrase) > 0:
            X.append((candidate_causal_pair[0], candidate_causal_pair[1]))
            y.append(label)

    return X, y
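# A hedged sketch of the causal-pairs CSV layout this method assumes: column 2
# holds a stringified (cause_phrase, effect_phrase) tuple that eval() turns back
# into a tuple, and column 3 holds the 'causal'/'non-causal' label. The first two
# columns and all concrete values below are invented for illustration.
EXAMPLE_DATASET_ROW = [
    "42",                                      # row id (assumed)
    "heavy rain caused flooding in the city",  # source sentence (assumed)
    "('heavy rain', 'flooding in the city')",  # candidate (cause, effect) pair
    "causal",                                  # label
]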
def distanceError():
    sum_dist = 0
    num = 0

    for part in Body25.Parts:
        coord1 = Body25.getCoordinates(keypoint1, part)
        coord2 = Body25.getCoordinates(keypoint2, part)
        dist = Ut.distance(coord1 / frame, coord2 / frame, ignore_zero_vector=True)

        if np.isnan(dist):
            continue
        else:
            sum_dist += dist
            num += 1

    if num == 0:
        return np.inf
    else:
        return sum_dist / num
def test_TestMorphological(self):
    # e.g. self.assertTrue(Matches.match("PostCode", "postalCode"));
    total = 0
    matched = 0

    for key, value in usToGB.items():

        lemmas = list(Utilities.lemmas(value))

        if (lemmas):

            total += 1
            shuffle(lemmas)

            if (Matches.matches(value, lemmas[0])):
                matched += 1
            else:
                print(str(value) + " " + str(lemmas[0]))

    matchPercentage = matched / float(total)
    self.assertTrue(matchPercentage > 0.90)
def run_script(browser='chrome'):
    driver = None
    try:
        driver = Utilities.create_webdriver_instance(browser=browser)
        driver.get(WEBSITE_URL)
        TaskFour.select_random_video_on_homepage(driver)
        TaskFour.wait_for_ad_to_complete(driver)
        TaskFour.move_progress_bar_to_some_position(driver)
    except Exception:
        # TODO: Remove broad exceptions
        # TODO: Add logging
        # TODO: Take screenshot
        traceback.print_exc()
    finally:
        if driver is not None:
            driver.quit()
        print('Script Complete')
def run_script(browser='chrome'):
    driver = None
    try:
        driver = Utilities.create_webdriver_instance(browser=browser)
        TaskTwo.search_for_flights(driver)
        TaskTwo.wait_for_flight_results_page_to_load(driver)
        TaskTwo.select_first_view_deal(driver)
        TaskTwo.select_second_view_deal(driver)
    except Exception:
        # TODO: Remove broad exceptions
        # TODO: Add logging
        # TODO: Take screenshot
        traceback.print_exc()
    finally:
        if driver is not None:
            driver.quit()
        print('Script Complete')
def __data_timer_cb(self, timer):
    value = self.get_temperature()
    print("current temperature: {} ℃".format(value))

    try:
        self.__publish_data(value)
    except OSError as ose:
        err_msg = str(ose)

        if err_msg == "-1":
            pass
        elif err_msg == "[Errno 113] EHOSTUNREACH":
            Utilities.hard_reset()
        else:
            Utilities.log(self.__data_timer_cb, err_msg, self.__log_callback)
    except Exception as e:
        err_msg = str(e)
        Utilities.log(self.__data_timer_cb, err_msg, self.__log_callback)
def __msg_timer_cb(self):
    while self.__starting:
        try:
            self.__mqtt_client.wait_msg()
        except OSError as ose:
            err_msg = str(ose)

            if err_msg == "-1":
                pass
            elif err_msg == "[Errno 113] EHOSTUNREACH":
                Utilities.hard_reset()
            else:
                Utilities.log(self.__msg_timer_cb, err_msg, self.__log_callback)
                # raise OSError(err_msg)
        except Exception as e:
            err_msg = str(e)
            Utilities.log(self.__msg_timer_cb, err_msg, self.__log_callback)

        gc.collect()
def do_job(articles, tokens, causal_net_generator):
    causal_pair_tokens = causal_net_generator.get_all_causal_pair_tokens(articles)
    tokens += causal_pair_tokens


if __name__ == '__main__':
    start = time.time()
    print("\nJob started at %s" %
          datetime.fromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S'))

    causal_net_generator = CausalNetGenerator()
    causal_net_generator_from_news = CausalNetGeneratorFromNews()
    multi_word_causal_net_generator_from_news = MultiWordCausalNetGeneratorFromNews()
    utilities = Utilities()
    manager = Manager()

    ## Generate causal net from Wikipedia articles
    tokens = manager.list()
    num_threads = cpu_count() - 1
    number = 1000000
    offset = 0
    print("Number: %d and offset %d" % (number, offset))
    graph_path = 'causal_net.pickle'

    articles = causal_net_generator.get_articles(number=number, offset=offset)
    dispatch_jobs(articles, num_threads, tokens, causal_net_generator)
def semanticSimilarity(ehrAttribute, fhirAttribute, useDefinition=False, alsoUseMorphologicalSimilarity=False, morphologicalSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD, compositeSynonyms=False, highestResult=True):

    # If these attributes would be associated via a text match instead, then don't also reevaluate their similarity via the text similarity below.
    if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute, False):
        return 0;

    highestSimilarity = 0;

    # Wordnet requires word separation by underscore, whereas EHR XML responses (for TPP at least) use camelCase (this won't be an issue if used with composite string similarity, where only one word is used at a time).
    for set in wordnet.synsets(Utilities.capitalToSeparation(ehrAttribute)):

        synonyms = set.lemma_names();

        if useDefinition:

            setType = set.pos();
            associatedSynonyms = [];

            if ( set not in SimilarityMetrics.synsetToDefinitionTerms ):

                # We also include words from the definition of this word, that are of the same grammatical type (e.g. noun or verb), as potential synonyms.
                for word in set.definition().split(" "):

                    if ( len(word) <= 3 or word in associatedSynonyms or "." in word ):
                        continue;

                    if ( word not in SimilarityMetrics.wordsToTypes ):

                        wordSynset = wordnet.synsets(word);

                        if not len(wordSynset):
                            continue;

                        # Find the most popular interpretation of this word, so we can find the right grammatical form.
                        chosenSynset = wordSynset[0];
                        highestLemmaPopularity = 0;

                        for candidateSynset in wordSynset:
                            for lemma in candidateSynset.lemmas():
                                if lemma.count() > highestLemmaPopularity:
                                    highestLemmaPopularity = lemma.count();
                                    chosenSynset = candidateSynset;

                        SimilarityMetrics.wordsToTypes[word] = chosenSynset.pos();

                    if ( SimilarityMetrics.wordsToTypes[word] == setType ):
                        associatedSynonyms.append(word);

                SimilarityMetrics.synsetToDefinitionTerms[set] = associatedSynonyms;

            synonyms = synonyms + SimilarityMetrics.synsetToDefinitionTerms[set];

        for synonym in synonyms:

            # Do we want the highest value across all components of the synonym, or just the synonym directly?
            if ( compositeSynonyms ):
                textSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.textSimilarity, [], highestResult);
            else:
                textSimilarity = SimilarityMetrics.textSimilarity(Utilities.separationToCapital(synonym), fhirAttribute);

            # Synonyms may also be grammatical variants as opposed to just text matches.
            if ( alsoUseMorphologicalSimilarity ):

                if ( compositeSynonyms ):
                    morphologicalSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.morphologicalSimilarity, [morphologicalSimilarityThreshold], highestResult);
                else:
                    morphologicalSimilarity = SimilarityMetrics.morphologicalSimilarity(synonym, fhirAttribute);

            else:
                morphologicalSimilarity = 0;

            # Get similarity between a synonym for ehrAttribute and fhirAttribute (not synonyms that are the EHR attribute itself). If this is over a given threshold, AND it is greater than previously marked highest values, update the highest similarity.
            if not SimilarityMetrics.textSimilarity(synonym, ehrAttribute) == 1.0 and max(textSimilarity, morphologicalSimilarity) > highestSimilarity:
                highestSimilarity = max(textSimilarity, morphologicalSimilarity);

    return highestSimilarity;
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from nltk.corpus import wordnet

from utils.utilities import Utilities
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from preprocessing.preprocesssor import Preprocessor

if __name__ == '__main__':
    causal_strength_calculator = CausalStrengthCalculator()
    utilities = Utilities()
    preprocessor = Preprocessor(
        ['remove_stopwords', 'remove_non_letters', 'lemmatize'])

    dataset_file = 'causal_pairs_dataset_old.csv'
    data_rows = utilities.read_from_csv(dataset_file)
    del data_rows[0]

    X = []
    y_true = []
    y_pred = []
    threshold = 10

    for data_row in data_rows[:10]:
        candidate_causal_pair = eval(data_row[2])
        label = 1 if data_row[3] == 'causal' else 0
        candidate_causal_phrase = preprocessor.preprocess(
            candidate_causal_pair[0])
        candidate_effect_phrase = preprocessor.preprocess(
            candidate_causal_pair[1])
        if len(candidate_causal_phrase) > 0 and len(
class EventExtractor:
    def __init__(self):
        self.data_file = app_config['data_file']
        self.texts_in_file = 'texts_in_file.txt'
        self.ner_texts_file = 'output.txt'
        self.utilities = Utilities()
        self.lemmatizer = WordNetLemmatizer()
        self.preprocessor = Preprocessor(
            ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])
        # jar_files = os.path.join(os.path.dirname(__file__), 'jars')
        # self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def save_texts_in_file(self):
        items = self.utilities.read_from_csv(self.data_file)
        header = items[0]
        texts = [item[header.index('text')] for item in items[1:]]
        processed_texts = [
            self.preprocessor.preprocess(text).encode('utf8') for text in texts
        ]
        self.utilities.save_list_as_text_file(processed_texts, self.texts_in_file)

    def prepare_phrases(self, matches, tag, token_position=0, tag_position=-1, splitter='/'):
        phrases = []
        phrase = ''
        for match in matches:
            match_components = match.split(splitter)
            text_token = match_components[token_position].lower().strip()
            event_tag = match_components[tag_position]
            if event_tag == 'B-' + tag and len(phrase) < 1:
                phrase += text_token
            elif event_tag == 'B-' + tag and len(phrase) > 0:
                phrases.append(phrase)
                phrase = text_token
            else:
                phrase += ' ' + text_token
        phrases.append(phrase)
        phrases = list(set(phrases))
        return phrases

    def get_event_phrases(self, text):
        tag_name = 'EVENT'
        matches = re.findall(r'\w+/O/[A-Z]+/[BI]-' + tag_name, text)
        phrases = self.prepare_phrases(matches, tag_name)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''
        return joined_text

    def get_event_locations(self, text):
        tag_name = 'geo-loc'
        matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
        phrases = self.prepare_phrases(matches=matches,
                                       tag=tag_name,
                                       token_position=0,
                                       tag_position=1)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''
        return joined_text

    def get_event_entities(self, text):
        tag_names = [
            'person', 'company', 'facility', 'product', 'band', 'sportsteam',
            'movie', 'tv-show'
        ]
        phrases = []
        for tag_name in tag_names:
            matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
            if len(matches) > 0:
                phrases += self.prepare_phrases(matches=matches,
                                                tag=tag_name,
                                                token_position=0,
                                                tag_position=1)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''
        return joined_text

    def extract_events(self):
        data_rows = self.utilities.read_from_csv(self.data_file)
        text_rows = self.utilities.read_lines_from_file(self.ner_texts_file)
        header = data_rows[0]
        del data_rows[0]
        events = []
        unique_texts = []
        for data_row, text_row in zip(data_rows, text_rows):
            text = self.preprocessor.preprocess(data_row[header.index('text')])
            if text in unique_texts:
                continue
            event = {
                'tweet_id': data_row[header.index('id')],
                'entities': self.get_event_entities(text_row),
                'locations': self.get_event_locations(text_row),
                'event_time': data_row[header.index('created_at')],
                'event_phrases': self.get_event_phrases(text_row),
            }
            events.append(event)
            unique_texts.append(text)
        return events

    def extract_events_from_stanford_dependencies(self, dependencies, ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]
                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}
                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())
                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value
                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'you', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())
                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    dobj_value = ner_tag[1]
                        raw_events[event_keyword]['dobj'] = dobj_value
                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]
        events = []
        for verb in list(raw_events.keys()):
            event = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event.keys()) or len(event['subj']) < 2 \
                    or 'dobj' not in list(event.keys()) or len(event['dobj']) < 2:
                continue
            event['keyword'] = verb
            events.append(event)
        return events

    def get_unique_tweets(self, n_rows=None):
        data_rows = self.utilities.read_from_csv(self.data_file)
        preprocessor = Preprocessor([
            'remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize',
            'remove_non_letters'
        ])
        header = data_rows[0]
        del data_rows[0]
        tweet_rows = {}
        for data_row in data_rows:
            if n_rows is not None and len(tweet_rows) >= n_rows:
                break
            tweet = preprocessor.preprocess(data_row[header.index('text')])
            if tweet not in list(tweet_rows.keys()):
                tweet_rows[tweet] = data_row
        tweet_rows = [header] + list(tweet_rows.values())
        return tweet_rows

    def get_tweet_sentences(self, tweet_rows):
        header = tweet_rows[0]
        del tweet_rows[0]
        tweet_sentences = []
        for tweet_row in tweet_rows:
            created_at = tweet_row[header.index('created_at')]
            text = self.preprocessor.preprocess(
                tweet_row[header.index('text')])
            sentences = sent_tokenize(text)
            for sentence in sentences:
                if len(sentence) > 1:
                    tweet_sentences.append((created_at, sentence))
        return tweet_sentences

    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'
        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
        events = []
        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))
        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))
            chunk_sent_dependencies = dependency_parser.raw_parse_sents(sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])
            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]
                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))
        return events

    def chunkify_events_by_timeslots(self, events, duration):
        slot_starts_at = None
        event_chunks = []
        event_chunk = []
        for event in events:
            created_at = datetime.strptime(event[0], '%d-%m-%Y %H:%M')
            if slot_starts_at is None:
                slot_starts_at = created_at
            if len(event_chunk) > 0 and created_at > slot_starts_at + timedelta(0, duration):
                event_chunks.append(event_chunk)
                event_chunk = []
                slot_starts_at = created_at
            event_chunk.append(event)
        event_chunks.append(event_chunk)
        return event_chunks
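# A small usage sketch for chunkify_events_by_timeslots, assuming app_config and
# the NLTK dependencies needed by __init__ are available: events are
# (timestamp, event_dict) tuples whose timestamps use the '%d-%m-%Y %H:%M'
# format expected above, and duration is a slot length in seconds. The sample
# events are invented.
if __name__ == '__main__':
    extractor = EventExtractor()
    sample_events = [
        ('01-06-2018 10:00', {'keyword': 'win'}),
        ('01-06-2018 10:20', {'keyword': 'score'}),
        ('01-06-2018 11:30', {'keyword': 'celebrate'}),
    ]
    # One-hour slots: the first two events share a chunk, the third starts a new one.
    chunks = extractor.chunkify_events_by_timeslots(sample_events, duration=3600)
    print([len(chunk) for chunk in chunks])  # expected: [2, 1]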
import timeit
import collections

from utils.utilities import Utilities
from preprocessing.preprocesssor import Preprocessor
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from causality_detection.itemsest_causality import ItemsetCausality

if __name__ == "__main__":
    start_time = timeit.default_timer()
    event_file_path = 'events.csv'

    utilities = Utilities()
    causal_strength_calculator = CausalStrengthCalculator()
    itemset_causality = ItemsetCausality()
    preprocessor = Preprocessor(params=['lower', 'lemmatize'])

    rows = utilities.read_from_csv(event_file_path)
    header = rows[0]
    del rows[0]

    events_phrases = []
    for row in rows:
        phrases = [phrase.strip()
                   for phrase in row[header.index('event_phrases')].split(',')]
        events_phrases += phrases

    sorted_event_phrases = collections.Counter(events_phrases).most_common()
    low_freq_events = [event[0] for event in sorted_event_phrases if event[1] <= 5]

    event_rows = []
    for row in rows:
from utils.utilities import Utilities
from preprocessing.event_extractor import EventExtractor
import time

if __name__ == '__main__':
    # Original Code
    #
    # utilities = Utilities()
    # event_extraction = EventExtractor()
    #
    # event_extraction.save_texts_in_file()
    # events = event_extraction.extract_events()
    #
    # for event in events:
    #     if len(event['event_phrases']) > 0:
    #         utilities.save_or_append_in_csv(event, 'events.csv')

    utilities = Utilities()
    event_extractor = EventExtractor()

    tweet_rows = event_extractor.get_unique_tweets()
    tweet_sentences = event_extractor.get_tweet_sentences(tweet_rows)
    events = event_extractor.extract_events2(tweet_sentences)
    events = sorted(events,
                    key=lambda x: time.strptime(x[0], '%d-%m-%Y %H:%M'))

    utilities.save_or_append_list_as_csv(events, 'events2.csv')
def __init__(self):
    self.data_source_file = None
    self.utilities = Utilities()
class BackgroundDataCollection:
    def __init__(self):
        self.data_source_file = None
        self.utilities = Utilities()

    def set_data_source_file(self, source_file):
        self.data_source_file = source_file

    def remove_out_of_range_historic_urls(self, urls, date_from, date_to):
        """
        Remove out-of-range urls.

        :param urls: list of urls
        :param date_from: date from
        :param date_to: date to
        :return: list of urls within the range
        """
        in_range_urls = []
        try:
            date_from = parser.parse(str(date_from))
            date_to = parser.parse(str(date_to))
        except ValueError:
            raise Exception("Invalid date range. Please input dates in yyyymmdd format")

        for url in urls:
            if len(url) > 43:
                date_str = url[28:42]
                url_time = parser.parse(date_str)
                if date_from <= url_time <= date_to:
                    in_range_urls.append(url)
        return in_range_urls

    def collect_data(self, date_from, date_to):
        """
        Run the whole workflow for historical article collection within a range.

        :param date_from: date from
        :param date_to: date to
        :return: list of articles
        """
        # os.makedirs(self.articles_base_dir, exist_ok=True)
        try:
            parser.parse(str(date_from))
            parser.parse(str(date_to))
        except ValueError:
            print("Invalid date format. Please provide dates in yyyymmdd format.")
            return

        source_urls = self.utilities.read_lines_from_file(self.data_source_file)
        new_file_count = 0
        for source_url in source_urls:
            url_str = str(subprocess.run(
                ['waybackpack', source_url, '--list',
                 '--from-date', str(date_from),
                 '--to-date', str(date_to)],
                stdout=subprocess.PIPE).stdout.decode('utf-8'))
            urls = url_str.splitlines()
            print(urls)
            exit()
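# A quick illustration of the url[28:42] slice used above, assuming `parser` is
# dateutil's parser (as the parse calls suggest): Wayback Machine snapshot URLs
# embed a 14-digit timestamp after the host, so characters 28..41 are the
# YYYYMMDDHHMMSS stamp. The concrete URL below is invented for illustration.
from dateutil import parser

example_url = "https://web.archive.org/web/20190615093000/http://example.com/"
print(example_url[28:42])                # "20190615093000"
print(parser.parse(example_url[28:42]))  # 2019-06-15 09:30:00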
def _OnWebSocketTextMsg(webSocket, msg):
    global Utilities, WifiHandler, Config
    import ujson

    print('WebSocket text message: %s' % msg)
    # webSocket.SendTextMessage('Received "%s"' % msg)

    try:
        params = ujson.loads(msg)

        if params["command"] == "identity":
            from utils.json_const import identity_result

            identity_result.update(
                hardware_version=Config.HARDWARE_VERSION,
                hardware_name=Config.HARDWARE_NAME,
                mac_address=WifiHandler.get_mac_address(),
                ip_address=WifiHandler.get_ip_address())
            webSocket.SendTextMessage(ujson.dumps(identity_result))
        elif params["command"] == "save_settings":
            from utils.json_const import save_settings_result_success, save_settings_result_failed
            from utils.settings_template import template

            settings = template.format(**params)
            # print(settings)

            with open("settings.py", "w") as file:
                length = file.write(settings)

            if length == len(settings):
                webSocket.SendTextMessage(
                    ujson.dumps(save_settings_result_success))
            else:
                webSocket.SendTextMessage(
                    ujson.dumps(save_settings_result_failed))
        elif params["command"] == "reboot_device":
            Utilities.hard_reset()
        elif params["command"] == "check_wifi":
            from utils.json_const import check_wifi_result

            result_code = WifiHandler.set_sta_mode(params["wifi_ssid"],
                                                   params["wifi_password"],
                                                   timeout_sec=60,
                                                   for_test=True)
            check_wifi_result.update(result_code=result_code)
            webSocket.SendTextMessage(ujson.dumps(check_wifi_result))

            if result_code == WifiHandler.STATION_CONNECTED:
                import urequests
                from utils.json_const import check_internet_result_success, check_internet_result_failed

                try:
                    res = urequests.get(Config.INTERNET_TESTING_URL, timeout=10.0)

                    if res:
                        if res.text == "Success":
                            webSocket.SendTextMessage(
                                ujson.dumps(check_internet_result_success))
                        else:
                            webSocket.SendTextMessage(
                                ujson.dumps(check_internet_result_failed))
                    else:
                        webSocket.SendTextMessage(
                            ujson.dumps(check_internet_result_failed))
                except Exception:
                    webSocket.SendTextMessage(
                        ujson.dumps(check_internet_result_failed))
        elif params["command"] == "check_mqtt":
            from umqtt.simple import MQTTClient
            from utils.json_const import check_mqtt_result_success, check_mqtt_result_failed

            def sub_cb(topic, msg):
                pass

            mqtt_client = MQTTClient(params["client_id"], params["host"],
                                     int(params["port"]), params["username"],
                                     params["password"],
                                     int(params["keepalive"]))

            try:
                username = params["bigiot_username"] if bool(
                    params["is_bigiot"]) else params["client_id"]

                mqtt_client.set_callback(sub_cb)
                print("check_mqtt_result:", mqtt_client.connect(True))
                print("test subscribe:",
                      mqtt_client.subscribe("{}/data".format(username).encode()))
                print("test publish:",
                      mqtt_client.publish("{}/data".format(username).encode(), "world"))
                mqtt_client.disconnect()

                webSocket.SendTextMessage(
                    ujson.dumps(check_mqtt_result_success))
            except Exception as e:
                print(str(e))

                # e == 5: authorization failed, meaning the device number or device authorization is wrong
                if str(e) == "5":
                    check_mqtt_result_failed.update(
                        error_code="5",
                        error_msg="Authorization failed, check Username and Password")
                # e == 128: subscribe failed, meaning the client_id or topic authorization (username/data) is wrong
                elif str(e) == "128":
                    check_mqtt_result_failed.update(
                        error_code="128",
                        error_msg="Subscribe failed, check Bigiot Username and Client ID")
                else:
                    check_mqtt_result_failed.update(
                        error_code=str(e),
                        error_msg="Unknown error: {}".format(str(e)))

                webSocket.SendTextMessage(
                    ujson.dumps(check_mqtt_result_failed))
    except ValueError:
        webSocket.SendTextMessage("Params Format Error")

    gc.collect()
from utils.utilities import Utilities
from database.db_operations import DbOperations
from required_files.config import app_config

if __name__ == '__main__':
    db_operations = DbOperations()
    utilities = Utilities()

    items = db_operations.get_all_item()
    data_file = app_config['data_file']

    for item in items:
        utilities.save_or_append_in_csv(item, data_file)
def __sub_cb(self, topic, msg):
    if topic != self._topic:
        return

    print("msg: {}".format(msg))

    try:
        json_obj = json.loads(str(msg, "utf-8"))
        command = json_obj['command']
        general_result = {
            'command': command + '_result',
            'mac_address': json_obj['mac_address'],
            'result': 'success'
        }

        if command == "wake_up_pc":
            for count in range(3):
                wake_on_lan(json_obj['mac_address'])

            general_result['title'] = json_obj['title']
            general_result['mac_address'] = WifiHandler.get_mac_address()
            self._client.publish(topic, json.dumps(general_result))
        elif command == 'device_remove':
            if json_obj['mac_address'] != WifiHandler.get_mac_address():
                return

            general_result['title'] = json_obj['title']
            self._client.publish(topic, json.dumps(general_result))

            Utilities.del_settings_file()
            Utilities.hard_reset()
        elif command == 'sync_datetime':
            if json_obj['mac_address'] != WifiHandler.get_mac_address():
                return

            datetime = json_obj['datetime']
            RTC().datetime((
                datetime['year'],
                datetime['month'],
                datetime['day'],
                datetime['weekday'],  # 0~6
                datetime['hour'],
                datetime['minute'],
                datetime['second'],
                datetime['millisecond']))

            self._client.publish(topic, json.dumps(general_result))
            print("datetime: %02d-%02d-%02d %02d:%02d:%02d" % (localtime()[:-2]))
        elif command == 'device_reboot':
            if json_obj['mac_address'] != WifiHandler.get_mac_address():
                return

            Utilities.hard_reset()
        elif command == 'report_error_log':
            if json_obj['mac_address'] != WifiHandler.get_mac_address():
                return

            general_result['logs'] = Utilities.read_logs()
            self._client.publish(topic, json.dumps(general_result))
    except ValueError:
        pass
    except KeyError as ke:
        print("KeyError:", ke)

    gc.collect()
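# A hedged sketch of the kind of payload __sub_cb expects on the subscribed
# topic, derived from the keys it reads above; the field values and MAC address
# format below are invented for illustration.
EXAMPLE_SYNC_DATETIME_MSG = {
    "command": "sync_datetime",
    "mac_address": "aa:bb:cc:dd:ee:ff",  # must match WifiHandler.get_mac_address()
    "datetime": {
        "year": 2021, "month": 6, "day": 15,
        "weekday": 1,  # 0~6
        "hour": 9, "minute": 30, "second": 0, "millisecond": 0
    }
}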