Code example #1
def ImportDB(workspace):

    importScript = "/Users/amitrou/Documents/CodeProjects/historical_data/restore-db.sh"
    if not os.path.exists(importScript):
        raise Exception("Script doesn't exist")
        return

    settingsPath = os.path.join(workspace, "import_settings.json")
    if not os.path.exists(settingsPath):
        raise Exception("Configuration file doen't exist")
        return

    import_settings = Utilities.ReadJSON(settingsPath)

    if import_settings is None:
        raise Exception("Failed to read import settings")

    server = import_settings['host']
    port = import_settings['port']
    user = import_settings['user']
    dump_list = import_settings["dump_list"]

    if len(dump_list) == 0:
        raise Exception("No contents")

    for item in dump_list:
        dump_file = item["dupm_file"]
        db_name = item["db_name"]

        if (dump_file is not None) and (db_name is not None):
            Utilities.ExecuteShellScript(importScript, server, port, user,
                                         db_name, dump_file)
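
A minimal sketch of the settings file this function expects, reconstructed from the keys it reads; the workspace path, host, port, user and dump entries below are placeholders, not values from the project.

import json
import os

# Hypothetical import_settings.json with the keys ImportDB() reads.
sample_settings = {
    "host": "localhost",
    "port": 5432,
    "user": "postgres",
    "dump_list": [
        {"dump_file": "prices_2019.dump", "db_name": "historical_prices"},
    ],
}

workspace = "/tmp/historical_data_workspace"
os.makedirs(workspace, exist_ok=True)
with open(os.path.join(workspace, "import_settings.json"), "w") as settings_file:
    json.dump(sample_settings, settings_file, indent=2)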
Code example #2
    def __heartbeat_cb(self, timer):
        self.__heartbeat_counter += 1

        if self.__heartbeat_counter >= Settings.MQTT_KEEPALIVE:
            try:
                self.__client.publish(
                    '{}/ping'.format(Settings.MQTT_USERNAME).encode(), b'ping')
                self.__heartbeat_counter = 0
            except OSError as ose:
                err_msg = str(ose)

                print("err time:", time())
                print(err_msg)

                if err_msg in ("[Errno 104] ECONNRESET", "-1"):
                    try:
                        self.__client.disconnect()
                    except OSError:
                        pass
                    finally:
                        self.__client.connect()
                elif err_msg == "[Errno 113] EHOSTUNREACH":
                    Utilities.hard_reset()

        gc.collect()
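
A sketch of how this callback might be scheduled, assuming a MicroPython machine.Timer; the timer id, the one-second period, and the method name __init_heartbeat_timer are assumptions, not taken from the project.

    def __init_heartbeat_timer(self):
        # Sketch only: call __heartbeat_cb once per second so that
        # __heartbeat_counter reaches Settings.MQTT_KEEPALIVE after roughly
        # MQTT_KEEPALIVE seconds. The timer id depends on the port (e.g. -1
        # for a virtual timer on ESP8266/ESP32).
        from machine import Timer

        self.__heartbeat_timer = Timer(-1)
        self.__heartbeat_timer.init(period=1000, mode=Timer.PERIODIC,
                                    callback=self.__heartbeat_cb)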
Code example #3
File: event_extractor.py Project: hkayesh/causality
 def __init__(self):
     self.data_file = app_config['data_file']
     self.texts_in_file = 'texts_in_file.txt'
     self.ner_texts_file = 'output.txt'
     self.utilities = Utilities()
     self.lemmatizer = WordNetLemmatizer()
     self.preprocessor = Preprocessor(
         ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])
Code example #4
    def __init__(self):
        self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        self.ner_tagger = StanfordNERTagger(self.path_to_ner_model,
                                            self.path_to_ner_tagger)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        self.lemmatizer = WordNetLemmatizer()
        self.utilities = Utilities()
Code example #5
    def getEHRClasses(patientXML, children=True, parents=True, duplicates=False):

        if ( duplicates ):

            ehrClasses = Utilities.getXMLElements(patientXML, {}, children, parents, duplicates);
            allValues = [];
            for depth in ehrClasses: allValues += ehrClasses[depth];
            return allValues;

        else:

            # Combines all values in dictionary of EHR depths.
            return [element.tag for element in set(set().union(*list(Utilities.getXMLElements(patientXML, {}, children, parents, duplicates).values())))];
Code example #6
 def centroidError():
     centroid1 = Body25.getCentroid(keypoint1)
     centroid2 = Body25.getCentroid(keypoint2)
     dist = Ut.distance(centroid1 / frame, centroid2 / frame, ignore_zero_vector=True)
     if np.isnan(dist):
         return np.inf
     return dist
Code example #7
    def getEHRClassChildren(patientXML, ehrClass, children=True, parents=False, allEHRChildren=False, contextualiseChildren=True, removeGerunds=True):

        ehrClassChildren = {};

        for ehrClassExample in patientXML.findall(".//" + ehrClass):

            ehrClassExampleDepthsToChildren = Utilities.getXMLElements(ehrClassExample, {}, children, parents, False, True, True);

            if 0 in list(ehrClassExampleDepthsToChildren.keys()):

                for element in ehrClassExampleDepthsToChildren[0]:

                    # Contextualise those EHR children that do not give enough context on their own, because they are just generic children.
                    if ( contextualiseChildren and element.tag.lower() in TranslationConstants.FIELDS_THAT_INDICATE_RESOURCE_CAN_HOLD_ANY_DATA ):

                        # Work out how to present this new compound child (child + parent name), based on which separators are used by this EHR.
                        if ( TranslationConstants.SEPARATOR != "" ):
                            element.tag = ehrClass + TranslationConstants.SEPARATOR + element.tag;

                        else:
                            element.tag = ehrClass[0].upper() + ehrClass[1:] + element.tag;

                    ehrClassChildren.setdefault(ehrClass, []).extend([element.tag]);

                    # If an EHR word begins with a gerund (such as 'Managing' in 'ManagingOrganisation'), this potentially complicates the context of the word, and so should be accounted for. Remove gerunds AND add the gerund free version as an additional EHR child.
                    if ( removeGerunds ): ehrClassChildren.setdefault(ehrClass, []).extend([TranslationUtilities.removeGerund(element.tag)]);


            # As we may have multiple examples of an EHR class in an example piece of marked up data from an EHR vendor, we want to find all possible examples of children that can be listed under that class.
            if ( not allEHRChildren ): break;

        return ehrClassChildren;
Code example #8
    def getFHIRClassesToChildren(fhirClasses=TranslationUtilities.getFHIRClasses(), linkedClasses=True, fhirClassesRecurse=False, selectiveRecurse=TranslationConstants.SELECTIVE_RECURSE, includesBackboneElements=True, mergeMainChildrenWithBackboneChildren=True):

        fhirClassesToChildren = {};

        if (includesBackboneElements and mergeMainChildrenWithBackboneChildren):

            for fhirClassAndBackboneElements in fhirClasses:

                fhirClass = fhirClassAndBackboneElements[0];

                for fhirClassOrBackboneElement in fhirClassAndBackboneElements:

                    children = TranslationUtilities.getFHIRClassChildren(fhirClassOrBackboneElement, linkedClasses, fhirClassesRecurse, selectiveRecurse);

                    if ( children != None ): fhirClassesToChildren.setdefault(fhirClass, []).extend(children)

        else:

            if ( not mergeMainChildrenWithBackboneChildren ): fhirClasses = Utilities.mergeListOfLists(fhirClasses);

            for fhirClass in fhirClasses:

                children = TranslationUtilities.getFHIRClassChildren(fhirClass, linkedClasses, fhirClassesRecurse, selectiveRecurse);

                if ( children != None ): fhirClassesToChildren[fhirClass] = children;

        return fhirClassesToChildren;
Code example #9
    def removeGerund(ehrChild):

        separatedElementTag = Utilities.listFromCapitals(ehrChild);

        if ( len(separatedElementTag) > 1 ):

            taggedSeperatedElementTag = nltk.pos_tag(separatedElementTag);
            separatedElementTag = [tag[0] for tag in taggedSeperatedElementTag if "VBG" not in tag[1]]
            return "".join(separatedElementTag);

        return ehrChild;
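
A standalone illustration of the same idea (not the project's code): split a camel-cased EHR tag on capitals, drop any token that nltk tags as a gerund (VBG), and re-join. "ManagingOrganisation" is the example used in the comments of these snippets; the exact POS tag nltk assigns can vary, so the expected output is indicative only.

import re
import nltk  # requires the averaged_perceptron_tagger data to be downloaded

def remove_gerund_sketch(ehr_child):
    # Split on capital letters, mirroring what Utilities.listFromCapitals
    # appears to do in the snippet above.
    words = re.findall(r'[A-Z][a-z]*', ehr_child)
    if len(words) > 1:
        tagged = nltk.pos_tag(words)
        return "".join(word for word, tag in tagged if "VBG" not in tag)
    return ehr_child

print(remove_gerund_sketch("ManagingOrganisation"))  # expected: "Organisation"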
Code example #10
    def morphologicalSimilarity(ehrAttribute, fhirAttribute, lemmaSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD):

        if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute): return 1;

        highestSimilarity = 0;

        for lemma in Utilities.lemmas(ehrAttribute):

            if SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True) > highestSimilarity and SimilarityMetrics.textMatch(lemma, fhirAttribute, True, lemmaSimilarityThreshold):
                highestSimilarity = SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True);

        return highestSimilarity;
Code example #11
    def compositeStringSimilarity(ehrClassField, fhirClassField, comparisonMethod, comparisonMethodArgs=[], highestResult=True, removeStopwords=True):

        if ( comparisonMethod(ehrClassField, fhirClassField, *comparisonMethodArgs) == 1 ): return 1;

        # If ehrClass string is composite, compare each word with the FHIR target using all of the metrics, and then use chosen combination method to produce a value, e.g. for each word, add these values, and then divide by number of words to get an average match across all words or return highest.
        highestSimilarity = 0;
        highestSimilarityWord = "";

        totalSimilarity = 0;

        ehrWords = Utilities.listFromCapitals(ehrClassField);
        fhirWords = Utilities.listFromCapitals(fhirClassField);

        if (removeStopwords): ehrWords = [word for word in ehrWords if word.lower() not in stopwords.words('english')];

        for ehrWord in ehrWords:

            highestSimilarityForEHRWord = 0;

            for fhirWord in fhirWords:

                similarity = comparisonMethod(ehrWord, fhirWord, *comparisonMethodArgs);

                if ( similarity > highestSimilarity ):

                    highestSimilarity = similarity;
                    highestSimilarityWord = ehrWord;

                if ( similarity > highestSimilarityForEHRWord ): highestSimilarityForEHRWord = similarity;

            totalSimilarity += highestSimilarityForEHRWord;

        if ( highestResult and len(highestSimilarityWord) > TranslationConstants.LENGTH_TO_IGNORE_IN_COMPOSITE_HIGHEST ):

            return highestSimilarity;

        else:

            return old_div(totalSimilarity, max(float(len(ehrWords)), float(len(fhirWords))));
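
A self-contained toy version of the averaging branch described in the comment above (placeholder metric, not the project's textSimilarity): each EHR word keeps its best score against the FHIR words, and the total is divided by the longer word count.

import re

def word_similarity(a, b):
    # Placeholder metric: exact, case-insensitive match only.
    return 1.0 if a.lower() == b.lower() else 0.0

def composite_similarity_sketch(ehr_field, fhir_field):
    ehr_words = re.findall(r'[A-Z][a-z]*', ehr_field)
    fhir_words = re.findall(r'[A-Z][a-z]*', fhir_field)
    total = sum(max(word_similarity(e, f) for f in fhir_words) for e in ehr_words)
    return total / max(len(ehr_words), len(fhir_words))

# "DateOfBirth" vs "BirthDate": two of the three EHR words match, averaged over 3.
print(composite_similarity_sketch("DateOfBirth", "BirthDate"))  # 0.666...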
Code example #12
File: preprocesssor.py Project: hkayesh/causality
 def __init__(self, params=list()):
     self.remove_urls = 'remove_urls' in params
     self.remove_mentions = 'remove_mentions' in params
     self.remove_hashtags = 'remove_hashtags' in params
     self.normalize = 'normalize' in params
     self.remove_stopwords = 'remove_stopwords' in params
     self.remove_punct = 'remove_punctuation' in params
     self.lower = 'lower' in params
     self.lemmatize = 'lemmatize' in params
     self.stemming = 'stemming' in params
     self.remove_non_letters = 'remove_non_letters' in params
     self.lemmatizer = WordNetLemmatizer()
     self.stemmer = PorterStemmer()
     self.utilities = Utilities()
Code example #13
    def get_evaluation_data(self, dataset_file, n_pair):
        utilities = Utilities()

        preprocessor = Preprocessor(
            ['remove_stopwords', 'remove_non_letters', 'lemmatize'])

        data_rows = utilities.read_from_csv(dataset_file)
        del data_rows[0]

        X = []
        y = []
        for data_row in data_rows[:n_pair]:
            candidate_causal_pair = eval(data_row[2])
            label = 1 if data_row[3] == 'causal' else 0

            candidate_causal_phrase = preprocessor.preprocess(
                candidate_causal_pair[0])
            candidate_effect_phrase = preprocessor.preprocess(
                candidate_causal_pair[1])
            if len(candidate_causal_phrase) > 0 and len(
                    candidate_effect_phrase) > 0:
                X.append((candidate_causal_pair[0], candidate_causal_pair[1]))
                y.append(label)
        return X, y
Code example #14
 def distanceError():
     sum_dist = 0
     num = 0
     for part in Body25.Parts:
         coord1 = Body25.getCoordinates(keypoint1, part)
         coord2 = Body25.getCoordinates(keypoint2, part)
         dist = Ut.distance(coord1 / frame, coord2 / frame, ignore_zero_vector=True)
         if np.isnan(dist):
             continue
         else:
             sum_dist += dist
         num += 1
     if num == 0:
         return np.inf
     else:
         return sum_dist / num
Code example #15
    def test_TestMorphological(self):

        # e.g. self.assertTrue(Matches.match("PostCode", "postalCode"));

        total = 0
        matched = 0
        for key, value in usToGB.items():
            lemmas = list(Utilities.lemmas(value))
            if (lemmas):
                total += 1
                shuffle(lemmas)
                if (Matches.matches(value, lemmas[0])):
                    matched += 1
                else:
                    print(str(value) + " " + str(lemmas[0]))
        matchPercentage = matched / float(total)
        self.assertTrue(matchPercentage > 0.90)
Code example #16
    def run_script(browser='chrome'):
        driver = None
        try:
            driver = Utilities.create_webdriver_instance(browser=browser)
            driver.get(WEBSITE_URL)

            TaskFour.select_random_video_on_homepage(driver)
            TaskFour.wait_for_ad_to_complete(driver)
            TaskFour.move_progress_bar_to_some_position(driver)

        except Exception:
            # TODO: Remove broad exceptions
            # TODO: Add logging
            # TODO: Take screenshot
            traceback.print_exc()

        finally:
            if driver is not None:
                driver.quit()
            print('Script Complete')
Code example #17
    def run_script(browser='chrome'):
        driver = None
        try:
            driver = Utilities.create_webdriver_instance(browser=browser)

            TaskTwo.search_for_flights(driver)
            TaskTwo.wait_for_flight_results_page_to_load(driver)
            TaskTwo.select_first_view_deal(driver)
            TaskTwo.select_second_view_deal(driver)

        except Exception:
            # TODO: Remove broad exceptions
            # TODO: Add logging
            # TODO: Take screenshot
            traceback.print_exc()

        finally:
            if driver is not None:
                driver.quit()
            print('Script Complete')
Code example #18
    def __data_timer_cb(self, timer):
        value = self.get_temperature()

        print("current temperature: {} ℃".format(value))

        try:
            self.__publish_data(value)
        except OSError as ose:
            err_msg = str(ose)

            if err_msg == "-1":
                pass
            elif err_msg == "[Errno 113] EHOSTUNREACH":
                Utilities.hard_reset()
            else:
                Utilities.log(self.__data_timer_cb, err_msg,
                              self.__log_callback)
        except Exception as e:
            err_msg = str(e)
            Utilities.log(self.__data_timer_cb, err_msg, self.__log_callback)
Code example #19
    def __msg_timer_cb(self):
        while self.__starting:
            try:
                self.__mqtt_client.wait_msg()
            except OSError as ose:
                err_msg = str(ose)

                if err_msg == "-1":
                    pass
                elif err_msg == "[Errno 113] EHOSTUNREACH":
                    Utilities.hard_reset()
                else:
                    Utilities.log(self.__msg_timer_cb, err_msg,
                                  self.__log_callback)
                    # raise OSError(err_msg)
            except Exception as e:
                err_msg = str(e)
                Utilities.log(self.__msg_timer_cb, err_msg,
                              self.__log_callback)

            gc.collect()
Code example #20
File: main_causal_net.py Project: hkayesh/causality
def do_job(articles, tokens, causal_net_generator):
    causal_pair_tokens = causal_net_generator.get_all_causal_pair_tokens(
        articles)

    tokens += causal_pair_tokens


if __name__ == '__main__':
    start = time.time()
    print("\nJob started at %s" %
          datetime.fromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S'))
    causal_net_generator = CausalNetGenerator()
    causal_net_generator_from_news = CausalNetGeneratorFromNews()
    multi_word_causal_net_generator_from_news = MultiWordCausalNetGeneratorFromNews(
    )
    utilities = Utilities()
    manager = Manager()

    ## Generate causal net from wikipedia articles

    tokens = manager.list()
    num_threads = cpu_count() - 1
    number = 1000000
    offset = 0
    print("Number: %d and offset %d" % (number, offset))

    graph_path = 'causal_net.pickle'

    articles = causal_net_generator.get_articles(number=number, offset=offset)
    dispatch_jobs(articles, num_threads, tokens, causal_net_generator)
Code example #21
    def semanticSimilarity(ehrAttribute, fhirAttribute, useDefinition=False, alsoUseMorphologicalSimilarity=False, morphologicalSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD, compositeSynonyms=False, highestResult=True ):

        # If these attributes would be associated via a text match instead, then don't also reevaluate their similarity via the text similarity below.
        if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute, False): return 0;

        highestSimilarity = 0;

        # wordnet requires word separation by underscore, whereas EHR XML responses (for TPP at least) use camelCase (this won't be an issue if used with composite string similarity, where only one word is used at a time).
        for set in wordnet.synsets(Utilities.capitalToSeparation(ehrAttribute)):

            synonyms = set.lemma_names();

            if useDefinition:

                setType = set.pos();
                associatedSynonyms = [];

                if ( set not in SimilarityMetrics.synsetToDefinitionTerms ):

                    # We also include words from the definition of this word, that are of the same grammatical type (e.g. noun or verb), as potential synonyms.
                    for word in set.definition().split(" "):

                        if ( len(word) <= 3 or word in associatedSynonyms or "." in word ): continue;

                        if ( word not in SimilarityMetrics.wordsToTypes ):

                            wordSynset = wordnet.synsets(word);

                            if not len(wordSynset): continue;

                            # Find most popular interpretation of this word, so can find right grammatical form.
                            chosenSynset = wordSynset[0];
                            highestLemmaPopularity = 0;

                            for wordSet in wordSynset:

                                for lemma in wordSet.lemmas():

                                    if lemma.count() > highestLemmaPopularity:
                                        highestLemmaPopularity = lemma.count();
                                        chosenSynset = wordSet;

                            SimilarityMetrics.wordsToTypes[word] = chosenSynset.pos();

                        if ( SimilarityMetrics.wordsToTypes[word] == setType ):

                            associatedSynonyms.append(word);

                    SimilarityMetrics.synsetToDefinitionTerms[set] = associatedSynonyms;

                synonyms = synonyms + SimilarityMetrics.synsetToDefinitionTerms[set];

            for synonym in synonyms:

                # Do we want the highest value across all components of the synonym, or just the synonym directly?
                if ( compositeSynonyms ):

                    textSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.textSimilarity, [], highestResult);

                else:

                    textSimilarity = SimilarityMetrics.textSimilarity(Utilities.separationToCapital(synonym), fhirAttribute);

                # Synonyms may also be grammatical variants as opposed to just text matches.
                if ( alsoUseMorphologicalSimilarity ):

                    if ( compositeSynonyms ):

                        morphologicalSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.morphologicalSimilarity, [morphologicalSimilarityThreshold], highestResult);

                    else:

                        morphologicalSimilarity = SimilarityMetrics.morphologicalSimilarity(synonym, fhirAttribute);

                else:
                    morphologicalSimilarity = 0;

                # Get similarity between synonym for ehrAttribute and fhirAttribute (not synonyms that are the ehr attribute itself). If this is over a given threshold, AND it is greater than previously marked highest values, update highest similarity.
                if not SimilarityMetrics.textSimilarity(synonym, ehrAttribute) == 1.0 and max(textSimilarity, morphologicalSimilarity) > highestSimilarity:

                    highestSimilarity = max(textSimilarity, morphologicalSimilarity);

        return highestSimilarity;
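
A standalone peek at the WordNet data the method above walks over (requires the nltk wordnet corpus to be downloaded); "practitioner" is just an illustrative EHR-style attribute, not one taken from the project.

from nltk.corpus import wordnet

for synset in wordnet.synsets("practitioner"):
    # pos() and definition() are what the useDefinition branch above inspects;
    # lemma_names() are the synonyms it compares against the FHIR attribute.
    print(synset.pos(), synset.lemma_names(), "-", synset.definition())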
Code example #22
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from nltk.corpus import wordnet

from utils.utilities import Utilities
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from preprocessing.preprocesssor import Preprocessor

if __name__ == '__main__':
    causal_strength_calculator = CausalStrengthCalculator()

    utilities = Utilities()
    preprocessor = Preprocessor(
        ['remove_stopwords', 'remove_non_letters', 'lemmatize'])
    dataset_file = 'causal_pairs_dataset_old.csv'

    data_rows = utilities.read_from_csv(dataset_file)
    del data_rows[0]
    X = []
    y_true = []
    y_pred = []
    threshold = 10

    for data_row in data_rows[:10]:
        candidate_causal_pair = eval(data_row[2])
        label = 1 if data_row[3] == 'causal' else 0

        candidate_causal_phrase = preprocessor.preprocess(
            candidate_causal_pair[0])
        candidate_effect_phrase = preprocessor.preprocess(
            candidate_causal_pair[1])
        if len(candidate_causal_phrase) > 0 and len(
Code example #23
File: event_extractor.py Project: hkayesh/causality
class EventExtractor:
    def __init__(self):
        self.data_file = app_config['data_file']
        self.texts_in_file = 'texts_in_file.txt'
        self.ner_texts_file = 'output.txt'
        self.utilities = Utilities()
        self.lemmatizer = WordNetLemmatizer()
        self.preprocessor = Preprocessor(
            ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])

        # jar_files = os.path.join(os.path.dirname(__file__), 'jars')
        # self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def save_texts_in_file(self):
        items = self.utilities.read_from_csv(self.data_file)

        header = items[0]

        texts = [item[header.index('text')] for item in items[1:]]

        processed_texts = [
            self.preprocessor.preprocess(text).encode('utf8') for text in texts
        ]

        self.utilities.save_list_as_text_file(processed_texts,
                                              self.texts_in_file)

    def prepare_phrases(self,
                        matches,
                        tag,
                        token_position=0,
                        tag_position=-1,
                        splitter='/'):
        phrases = []
        phrase = ''
        for match in matches:
            match_components = match.split(splitter)
            text_token = match_components[token_position].lower().strip()
            event_tag = match_components[tag_position]

            if event_tag == 'B-' + tag and len(phrase) < 1:
                phrase += text_token
            elif event_tag == 'B-' + tag and len(phrase) > 0:
                phrases.append(phrase)
                phrase = text_token
            else:
                phrase += ' ' + text_token
        phrases.append(phrase)
        phrases = list(set(phrases))

        return phrases

    def get_event_phrases(self, text):
        tag_name = 'EVENT'
        matches = re.findall(r'\w+/O/[A-Z]+/[BI]-' + tag_name, text)
        phrases = self.prepare_phrases(matches, tag_name)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_locations(self, text):
        tag_name = 'geo-loc'
        matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
        phrases = self.prepare_phrases(matches=matches,
                                       tag=tag_name,
                                       token_position=0,
                                       tag_position=1)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_entities(self, text):
        tag_names = [
            'person', 'company', 'facility', 'product', 'band', 'sportsteam',
            'movie', 'tv-show'
        ]
        phrases = []
        for tag_name in tag_names:
            matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
            if len(matches) > 0:
                phrases += self.prepare_phrases(matches=matches,
                                                tag=tag_name,
                                                token_position=0,
                                                tag_position=1)

        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def extract_events(self):
        data_rows = self.utilities.read_from_csv(self.data_file)
        text_rows = self.utilities.read_lines_from_file(self.ner_texts_file)

        header = data_rows[0]
        del data_rows[0]
        events = []
        unique_texts = []
        for data_row, text_row in zip(data_rows, text_rows):
            text = self.preprocessor.preprocess(data_row[header.index('text')])

            if text in unique_texts:
                continue

            event = {
                'tweet_id': data_row[header.index('id')],
                'entities': self.get_event_entities(text_row),
                'locations': self.get_event_locations(text_row),
                'event_time': data_row[header.index('created_at')],
                'event_phrases': self.get_event_phrases(text_row),
            }

            events.append(event)
            unique_texts.append(text)

        return events

    def extract_events_from_stanford_dependencies(self, dependencies,
                                                  ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]

                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}

                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value

                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'you', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    dobj_value = ner_tag[1]

                        raw_events[event_keyword]['dobj'] = dobj_value

                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]

        events = []
        for verb in list(raw_events.keys()):
            event = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event.keys()) or len(event['subj']) < 2 \
                    or 'dobj' not in list(event.keys()) or len(event['dobj']) < 2:
                continue

            event['keyword'] = verb
            events.append(event)

        return events

    def get_unique_tweets(self, n_rows=None):
        data_rows = self.utilities.read_from_csv(self.data_file)
        preprocessor = Preprocessor([
            'remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize',
            'remove_non_letters'
        ])

        header = data_rows[0]
        del data_rows[0]
        tweet_rows = {}
        for data_row in data_rows:
            if n_rows is not None and len(tweet_rows) >= n_rows:
                break
            tweet = preprocessor.preprocess(data_row[header.index('text')])
            if tweet not in list(tweet_rows.keys()):
                tweet_rows[tweet] = data_row
        tweet_rows = [header] + list(tweet_rows.values())

        return tweet_rows

    def get_tweet_sentences(self, tweet_rows):
        header = tweet_rows[0]
        del tweet_rows[0]

        tweet_sentences = []
        for tweet_row in tweet_rows:
            created_at = tweet_row[header.index('created_at')]
            text = self.preprocessor.preprocess(
                tweet_row[header.index('text')])
            sentences = sent_tokenize(text)
            for sentence in sentences:
                if len(sentence) > 1:
                    tweet_sentences.append((created_at, sentence))

        return tweet_sentences

    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events

    def chunkify_events_by_timeslots(self, events, duration):
        slot_starts_at = None
        event_chunks = []
        event_chunk = []
        for event in events:
            created_at = datetime.strptime(event[0], '%d-%m-%Y %H:%M')

            if slot_starts_at is None:
                slot_starts_at = created_at

            if len(event_chunk
                   ) > 0 and created_at > slot_starts_at + timedelta(
                       0, duration):
                event_chunks.append(event_chunk)
                event_chunk = []
                slot_starts_at = created_at
            event_chunk.append(event)
        event_chunks.append(event_chunk)
        return event_chunks
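
A self-contained illustration of the token layout that get_event_phrases() feeds into prepare_phrases(): each match is token/chunk-tag/POS/event-tag, and consecutive B-/I- tags are grouped into one phrase. The tagged sentence below is made up.

import re

tagged_text = ("heavy/O/JJ/B-EVENT flooding/O/NN/I-EVENT "
               "hits/O/VBZ/O the/O/DT/O coast/O/NN/O")

matches = re.findall(r'\w+/O/[A-Z]+/[BI]-EVENT', tagged_text)
print(matches)  # ['heavy/O/JJ/B-EVENT', 'flooding/O/NN/I-EVENT']

# prepare_phrases(matches, 'EVENT') would join these into the single phrase
# "heavy flooding": a B- tag starts a phrase and I- tags extend it.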
Code example #24
import timeit
import collections

from utils.utilities import Utilities
from preprocessing.preprocesssor import Preprocessor
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from causality_detection.itemsest_causality import ItemsetCausality


if __name__ == "__main__":
    start_time = timeit.default_timer()
    event_file_path = 'events.csv'
    utilities = Utilities()
    causal_strength_calculator = CausalStrengthCalculator()
    itemset_causality = ItemsetCausality()
    preprocessor = Preprocessor(params=['lower', 'lemmatize'])

    rows = utilities.read_from_csv(event_file_path)
    header = rows[0]
    del rows[0]

    events_phrases = []
    for row in rows:
        phrases = [phrase.strip() for phrase in row[header.index('event_phrases')].split(',')]
        events_phrases += phrases

    sorted_event_phrases = collections.Counter(events_phrases).most_common()
    low_freq_events = [event[0] for event in sorted_event_phrases if event[1] <= 5]

    event_rows = []
    for row in rows:
Code example #25
File: main_event.py Project: hkayesh/causality
from utils.utilities import Utilities
from preprocessing.event_extractor import EventExtractor
import time

if __name__ == '__main__':
    # Original Code
    #
    # utilities = Utilities()
    # event_extraction = EventExtractor()
    # #
    # # event_extraction.save_texts_in_file()
    #
    # events = event_extraction.extract_events()
    #
    # for event in events:
    #     if len(event['event_phrases']) > 0:
    #         utilities.save_or_append_in_csv(event, 'events.csv')

    utilities = Utilities()
    event_extractor = EventExtractor()

    tweet_rows = event_extractor.get_unique_tweets()

    tweet_sentences = event_extractor.get_tweet_sentences(tweet_rows)

    events = event_extractor.extract_events2(tweet_sentences)
    events = sorted(events,
                    key=lambda x: time.strptime(x[0], '%d-%m-%Y %H:%M'))
    utilities.save_or_append_list_as_csv(events, 'events2.csv')
Code example #26
 def __init__(self):
     self.data_source_file = None
     self.utilities = Utilities()
Code example #27
class BackgroundDataCollection:

    def __init__(self):
        self.data_source_file = None
        self.utilities = Utilities()

    def set_data_source_file(self, source_file):
        self.data_source_file = source_file

    def remove_out_of_range_historic_urls(self, urls, date_from, date_to):
        """
        Remove out of range urls

        :param urls: list of urls
        :param date_from: date from
        :param date_to: date to
        :return: list of in the range urls
        """
        in_range_urls = []

        try:
            date_from = parser.parse(str(date_from))
            date_to = parser.parse(str(date_to))
        except ValueError:
            raise Exception("Invalid date range. Please input date in yyyymmdd format")

        for url in urls:
            if len(url) > 43:
                date_str = url[28:42]

                url_time = parser.parse(date_str)

                if date_from <= url_time <= date_to:
                    in_range_urls.append(url)

        return in_range_urls

    def collect_data(self, date_from, date_to):
        """
        Run the whole workflow for historical article collection within a range

        :param date_from: date from
        :param date_to: date to
        :return: list of articles
        """

        # os.makedirs(self.articles_base_dir, exist_ok=True)
        try:
            parser.parse(str(date_from))
            parser.parse(str(date_to))
        except ValueError:
            print("Invalid date format. Please provide date in yyyymmdd format.")
            return

        source_urls = self.utilities.read_lines_from_file(self.data_source_file)
        new_file_count = 0
        for source_url in source_urls:

            url_str = str(subprocess.run(
                ['waybackpack', source_url, '--list', '--from-date', str(date_from), '--to-date', str(date_to)],
                stdout=subprocess.PIPE).stdout.decode('utf-8'))
            urls = url_str.splitlines()

            print(urls)
            exit()
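
A hedged note on the slicing above: url[28:42] appears to assume Wayback Machine URLs of the form https://web.archive.org/web/YYYYMMDDhhmmss/..., where the 14-character snapshot timestamp starts at index 28. The URL below is made up.

from dateutil import parser

url = "https://web.archive.org/web/20190315120000/https://example.com/news"
print(url[28:42])                # 20190315120000
print(parser.parse(url[28:42]))  # 2019-03-15 12:00:00

# remove_out_of_range_historic_urls() keeps a URL when this parsed timestamp
# falls between date_from and date_to.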
Code example #28
    def _OnWebSocketTextMsg(webSocket, msg):
        global Utilities, WifiHandler, Config
        import ujson

        print('WebSocket text message: %s' % msg)
        # webSocket.SendTextMessage('Received "%s"' % msg)

        try:
            params = ujson.loads(msg)

            if params["command"] == "identity":
                from utils.json_const import identity_result

                identity_result.update(
                    hardware_version=Config.HARDWARE_VERSION,
                    hardware_name=Config.HARDWARE_NAME,
                    mac_address=WifiHandler.get_mac_address(),
                    ip_address=WifiHandler.get_ip_address())

                webSocket.SendTextMessage(ujson.dumps(identity_result))
            elif params["command"] == "save_settings":
                from utils.json_const import save_settings_result_success, save_settings_result_failed
                from utils.settings_template import template

                settings = template.format(**params)
                # print(settings)
                with open("settings.py", "w") as file:
                    length = file.write(settings)

                    if length == len(settings):
                        webSocket.SendTextMessage(
                            ujson.dumps(save_settings_result_success))
                    else:
                        webSocket.SendTextMessage(
                            ujson.dumps(save_settings_result_failed))
            elif params["command"] == "reboot_device":
                Utilities.hard_reset()
            elif params["command"] == "check_wifi":
                from utils.json_const import check_wifi_result

                result_code = WifiHandler.set_sta_mode(params["wifi_ssid"],
                                                       params["wifi_password"],
                                                       timeout_sec=60,
                                                       for_test=True)

                check_wifi_result.update(result_code=result_code)

                webSocket.SendTextMessage(ujson.dumps(check_wifi_result))

                if result_code == WifiHandler.STATION_CONNECTED:
                    import urequests
                    from utils.json_const import check_internet_result_success, check_internet_result_failed

                    try:
                        res = urequests.get(Config.INTERNET_TESTING_URL,
                                            timeout=10.0)

                        if res:
                            if res.text == "Success":
                                webSocket.SendTextMessage(
                                    ujson.dumps(check_internet_result_success))
                            else:
                                webSocket.SendTextMessage(
                                    ujson.dumps(check_internet_result_failed))
                        else:
                            webSocket.SendTextMessage(
                                ujson.dumps(check_internet_result_failed))
                    except Exception:
                        webSocket.SendTextMessage(
                            ujson.dumps(check_internet_result_failed))
            elif params["command"] == "check_mqtt":
                from umqtt.simple import MQTTClient
                from utils.json_const import check_mqtt_result_success, check_mqtt_result_failed

                def sub_cb(topic, msg):
                    pass

                mqtt_client = MQTTClient(params["client_id"], params["host"],
                                         int(params["port"]),
                                         params["username"],
                                         params["password"],
                                         int(params["keepalive"]))

                try:
                    username = params["bigiot_username"] if bool(
                        params["is_bigiot"]) else params["client_id"]

                    mqtt_client.set_callback(sub_cb)
                    print("check_mqtt_result:", mqtt_client.connect(True))
                    print(
                        "test subscribe:",
                        mqtt_client.subscribe(
                            "{}/data".format(username).encode()))
                    print(
                        "test publish:",
                        mqtt_client.publish(
                            "{}/data".format(username).encode(), "world"))
                    mqtt_client.disconnect()

                    webSocket.SendTextMessage(
                        ujson.dumps(check_mqtt_result_success))
                except Exception as e:
                    print(str(e))

                    # e == 5: authorization failed (wrong device number or device authorization)
                    if str(e) == "5":
                        check_mqtt_result_failed.update(
                            error_code="5",
                            error_msg=
                            "Authorized failed, check Username and Password")
                    # e == 128: subscribe failed (wrong client_id or topic authorization, i.e. username/data)
                    elif str(e) == "128":
                        check_mqtt_result_failed.update(
                            error_code="128",
                            error_msg=
                            "Subscribe failed, check Bigiot Username and Client ID"
                        )
                    else:
                        check_mqtt_result_failed.update(
                            error_code=str(e),
                            error_msg="Unknown error: {}".format(str(e)))

                    webSocket.SendTextMessage(
                        ujson.dumps(check_mqtt_result_failed))
        except ValueError:
            webSocket.SendTextMessage("Params Format Error")

        gc.collect()
Code example #29
from utils.utilities import Utilities
from database.db_operations import DbOperations
from required_files.config import app_config

if __name__ == '__main__':

    db_operations = DbOperations()
    utilities = Utilities()

    items = db_operations.get_all_item()
    data_file = app_config['data_file']
    for item in items:
        utilities.save_or_append_in_csv(item, data_file)
Code example #30
    def __sub_cb(self, topic, msg):
        if topic != self._topic:
            return

        print("msg: {}".format(msg))

        try:
            json_obj = json.loads(str(msg, "utf-8"))
            command = json_obj['command']
            general_result = {
                'command': command + '_result',
                'mac_address': json_obj['mac_address'],
                'result': 'success'
            }

            if command == "wake_up_pc":
                for count in range(3):
                    wake_on_lan(json_obj['mac_address'])

                general_result['title'] = json_obj['title']
                general_result['mac_address'] = WifiHandler.get_mac_address()
                self._client.publish(topic, json.dumps(general_result))
            elif command == 'device_remove':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                general_result['title'] = json_obj['title']
                self._client.publish(topic, json.dumps(general_result))

                Utilities.del_settings_file()
                Utilities.hard_reset()
            elif command == 'sync_datetime':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                datetime = json_obj['datetime']
                RTC().datetime((
                    datetime['year'],
                    datetime['month'],
                    datetime['day'],
                    datetime['weekday'],  # 0~6
                    datetime['hour'],
                    datetime['minute'],
                    datetime['second'],
                    datetime['millisecond']))

                self._client.publish(topic, json.dumps(general_result))

                print("datetime: %02d-%02d-%02d %02d:%02d:%02d" %
                      ((localtime()[:-2])))
            elif command == 'device_reboot':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                Utilities.hard_reset()
            elif command == 'report_error_log':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                general_result['logs'] = Utilities.read_logs()
                self._client.publish(topic, json.dumps(general_result))
        except ValueError:
            pass
        except KeyError as ke:
            print("KeyError:", ke)

        gc.collect()
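
A hedged sketch of the MQTT payload shapes __sub_cb() handles, reconstructed from the keys it reads; all values below are placeholders, not real device data.

import json

wake_up_pc_msg = json.dumps({
    "command": "wake_up_pc",
    "mac_address": "AA:BB:CC:DD:EE:FF",  # MAC of the PC to wake
    "title": "office-pc",
})

sync_datetime_msg = json.dumps({
    "command": "sync_datetime",
    "mac_address": "11:22:33:44:55:66",  # must match this device's MAC
    "datetime": {
        "year": 2021, "month": 5, "day": 1, "weekday": 5,  # weekday: 0~6
        "hour": 12, "minute": 0, "second": 0, "millisecond": 0,
    },
})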