    def test_ingest_ok(self, mock_writeToElasticsearch):
        test_event = {"datasource": "ntl"}
        test_config = {
            "data-sources": {
                "ntl": {
                    "type": "ntl",
                    "url": self.mock_dataset_url
                }
            }
        }

        ingest.makeQueryCall = MagicMock(
            return_value=UtilsTest().get_ntl_mock_data())

        mock_formatter_factory = FormatterFactory()
        mock_formatter_factory.get_formatter = MagicMock(
            return_value=NTLDataFormatter())

        mock_formatter = NTLDataFormatter()
        mock_formatter.get_data_objects = MagicMock()

        mock_slack_notifier = SlackNotifier(None, None)
        mock_slack_notifier.sendSlackNotification = MagicMock()

        ingest.ingest(test_event, test_config)

        mock_writeToElasticsearch.assert_called_once()
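
The mock_writeToElasticsearch parameter implies the test method is wrapped with unittest.mock.patch; a minimal sketch of the assumed setup (the patch target, class name, and URL are guesses, not shown in the listing):

from unittest import TestCase
from unittest.mock import patch

class IngestTest(TestCase):
    mock_dataset_url = "https://example.invalid/dataset"  # placeholder

    # Hypothetical patch target; the real path depends on where
    # writeToElasticsearch lives in the module under test.
    @patch("ingest.writeToElasticsearch")
    def test_ingest_ok(self, mock_writeToElasticsearch):
        ...  # body as in the example above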
Example #2
    def daily(self):
        print("Running Daily Job")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)
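
daily() relies on module-level directory settings defined elsewhere in the project; a hypothetical sketch of that configuration, with placeholder paths:

# Hypothetical configuration; the real project defines these elsewhere.
ingestdir = "/srv/media/ingest"         # incoming files land here
workingdir = "/srv/media/working"       # ingest.ingest() sorts files here by date
archivedir = "/srv/media/archive"       # archive.archive() pools sized volumes here
jsondatadir = "/srv/media/index"        # crawl.crawl() writes hash indexes here
duplicatedir = "/srv/media/duplicates"  # dedupe.dedupe() moves duplicates here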
Example #3
    def test_ingest_error_on_es(self, mock_sendSlackNotification,
                                mock_writeToElasticsearch):
        test_event = {"datasource": "ntl"}
        test_config = {
            "data-sources": {
                "ntl": {
                    "type": "ntl",
                    "url": self.mock_dataset_url
                }
            }
        }

        ingest.makeQueryCall = MagicMock(
            return_value=UtilsTest().get_ntl_mock_data())

        mock_writeToElasticsearch.side_effect = Exception("Test Exception")

        ingest.ingest(test_event, test_config)

        mock_writeToElasticsearch.assert_called_once()
        mock_sendSlackNotification.assert_called_once()
Example #4
    def test_ingest_invalid_formatter(self, mock_sendSlackNotification,
                                      mock_get_formatter):
        test_event = {"datasource": "ntl"}
        test_config = {
            "data-sources": {
                "ntl": {
                    "type": "ntl",
                    "url": self.mock_dataset_url
                }
            }
        }

        ingest.makeQueryCall = MagicMock(
            return_value=UtilsTest().get_ntl_mock_data())

        mock_get_formatter.return_value = None

        ingest.ingest(test_event, test_config)

        mock_get_formatter.assert_called_once()
        mock_sendSlackNotification.assert_called_once()
Example #5
def run(data_dir):
    """
    Run the pipeline, intermediate files go into
    data/extracted, data/parsed, and data/standardized
    which is ingested into ./expenses.db
    """

    cores = mp.cpu_count()
    pool = mp.Pool(cores)
    jobs = []

    raw_dir = os.path.join(data_dir, "raw")
    extracted_dir = os.path.join(data_dir, "extracted")
    parsed_dir = os.path.join(data_dir, "parsed")
    standardized_dir = os.path.join(data_dir, "standardized")

    if len(os.listdir(raw_dir)) == 0:
        return False

    make_dirs([extracted_dir, parsed_dir, standardized_dir])

    with tempfile.TemporaryDirectory() as tmp_standardized_dir:
        for raw, extracted, parsed, standardized in get_pipeline_files(
                raw_dir, extracted_dir, parsed_dir, tmp_standardized_dir):
            jobs.append(
                pool.apply_async(_etl, (raw, extracted, parsed, standardized)))

        [job.get() for job in jobs]
        # TODO: hardcoded expenses tablename and expenses.db
        ingest(
            get_files(tmp_standardized_dir),
            "expenses",
            os.path.join(data_dir, "expenses.db"),
        )
        for file_ in os.listdir(tmp_standardized_dir):
            os.replace(
                os.path.join(tmp_standardized_dir, file_),
                os.path.join(standardized_dir, file_),
            )
    return True
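
A minimal usage sketch for run(), assuming the layout named in the docstring (raw inputs under data/raw, results written to data/expenses.db):

# Hypothetical driver; "data" is a placeholder for the pipeline's data_dir.
if __name__ == "__main__":
    if run("data"):
        print("pipeline finished; results in data/expenses.db")
    else:
        print("nothing to do: data/raw is empty")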
Example #6
def trigger_ingest():
    url = request.args.get("url")
    table_name = ingest(url)

    with psycopg2.connect("host=localhost") as conn:
        with conn.cursor() as cur:
            cur.execute(
                "select column_name from information_schema.columns where table_name = %s and column_name != 'geom'",
                [table_name])
            columns = [row[0] for row in cur.fetchall()]

    for column_name in columns:
        annotate.suggest_concept(table_name, column_name)

    return jsonify({"status": "ok"})
Example #7
    def test(self):
        print("Running Full Test Sequence")
        #the ingest function sorts and moves files by date into the working/media directory
        ingest.ingest(ingestdir, workingdir)

        #the crawl function performs a hash index of all files in the target directories
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
        archivedirsum = crawl.crawl(False, archivedir, jsondatadir)

        #the dedupe function combines all hash indexes and analyzes the dataset for duplicates
        data_files = glob.glob(jsondatadir + '/*.json')
        #run the dedupe function
        dedupe.dedupe(data_files, duplicatedir)

        #after the dedupe function has moved duplicates out, reindex
        workingdirsum = crawl.crawl(True, workingdir, jsondatadir)

        #the archive function pulls from the working/media directory and pools into sized volumes
        archive.archive(archivedir, jsondatadir, workingdir, mediasize)

        #validate that all files in duplicates exist elsewhere before moving to validated
        validate.validate(duplicatedir, workingdir, archivedir, validateddir)

        print("Daily Job Completed Successfully")
Example #8
    def process(self, filepath):
        # ignore hidden files (e.g., .gitignore)
        if not os.path.basename(filepath)[0] == '.':
            try:
                processed = self.files[filepath]
            except KeyError:
                log.info(
                    'new file in dropbox %s created %s', filepath,
                    datetime.datetime.fromtimestamp(
                        os.path.getctime(filepath)))
                processed = False

            if not processed and not os.path.basename(
                    filepath) == '.gitignore':
                try:
                    _id = ingest.ingest(filepath)

                    # move file to ingested directory
                    dest = add_id(
                        _id,
                        os.path.join(app.ingested_path,
                                     os.path.basename(filepath)))
                    log.info('moving ingested file from %s to %s', filepath,
                             dest)
                    shutil.move(filepath, dest)

                except Exception as e:
                    log.warning('failed to ingest %s', filepath)
                    log.warning(e)

                    _id = save_file_metadata(filepath, status='error')

                    # move file to failed directory
                    dest = add_id(
                        _id,
                        os.path.join(app.failed_path,
                                     os.path.basename(filepath)))
                    log.info('moving failed file from %s to %s', filepath,
                             dest)
                    shutil.move(filepath, dest)

                processed = True

                self.files[filepath] = processed
Example #9
def testNoTitle():
    import ingest

    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)

    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)

    filename = os.path.join(TEST_INPUT, 'no-title.md')
    with open(filename, 'w') as fh:
        fh.write(noTitle)

    loc = ingest.ingest(filename, TEST_OUTPUT)

    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    assert len(notesFolders) == 1

    with open(os.path.join(notesFolders[0], 'note.md')) as fh:
        content = fh.read()
        assert 'jd9d09j1290js1902js129nvsvns' in content
Example #10
def testPreSection():
    import ingest

    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)

    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)

    filename = os.path.join(TEST_INPUT, 'pre-section.md')
    with open(filename, 'w') as fh:
        fh.write(preSection)

    loc = ingest.ingest(filename, TEST_OUTPUT)

    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    assert len(notesFolders) == 1

    with open(os.path.join(notesFolders[0], 'note.md')) as fh:
        content = fh.read()
        assert 'f48fh309dj0913dj9j190dj029' in content
Example #11
def testIngest():
    import ingest

    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)

    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)

    filename = os.path.join(TEST_INPUT, 'ingest.md')
    with open(filename, 'w') as fh:
        fh.write(simulatedIngestDotMd)

    loc = ingest.ingest(filename, TEST_OUTPUT)

    notesFolders = [f.split('_')[0] for f in os.listdir(loc)]
    print(notesFolders)

    assert time1.split(' ')[0] in notesFolders
    assert time2.split(' ')[0] in notesFolders

    # When no time is set in note, use current time.
    nowTimeAsFolderStr = ingest.unixTimeAsSafeStr(time.time()).split(' ')[0] 
    assert nowTimeAsFolderStr in notesFolders

    # Make sure that note that had no time now has time in it.
    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    
    filename = None
    for nf in notesFolders:
        if nowTimeAsFolderStr in nf:
            assert filename is None
            filename = os.path.join(nf, 'note.md')
    
    with open(filename, 'r') as fh:
        content = fh.read()
        assert 'time::' in content
Example #12
def testPoundIsFirst():
    import ingest

    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)

    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)

    poundFirst = """# The title

    the content
    """

    filename = os.path.join(TEST_INPUT, 'ingest.md')
    with open(filename, 'w') as fh:
        fh.write(poundFirst)

    loc = ingest.ingest(filename, TEST_OUTPUT, poundFirst.splitlines())

    notesFolders = [f.split('_')[0] for f in os.listdir(loc)]
    print(notesFolders)
Example #13
    def process(self, filepath):
        # ignore hidden files (e.g., .gitignore)
        if not os.path.basename(filepath)[0] == ".":
            try:
                processed = self.files[filepath]
            except KeyError:
                log.info(
                    "new file in dropbox %s created %s",
                    filepath,
                    datetime.datetime.fromtimestamp(os.path.getctime(filepath)),
                )
                processed = False

            if not processed and not os.path.basename(filepath) == ".gitignore":
                try:
                    _id = ingest.ingest(filepath)

                    # move file to ingested directory
                    dest = add_id(_id, os.path.join(app.ingested_path, os.path.basename(filepath)))
                    log.info("moving ingested file from %s to %s", filepath, dest)
                    shutil.move(filepath, dest)

                except Exception as e:
                    log.warn("failed to ingest %s", filepath)
                    log.warn(e)

                    _id = save_file_metadata(filepath, status="error")

                    # move file to failed directory
                    dest = add_id(_id, os.path.join(app.failed_path, os.path.basename(filepath)))
                    log.info("moving failed file from %s to %s", filepath, dest)
                    shutil.move(filepath, dest)

                processed = True

                self.files[filepath] = processed
Example #14
#!/usr/bin/env python3

from ingest import ingest
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

if __name__ == '__main__':
    source = ingest(sys.argv[1])
    with open(sys.argv[2], 'wb') as f:
        pickle.dump(source, f)
else:
    with open(sys.argv[1], 'rb') as f:
        source = pickle.load(f)
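
The __main__ branch ingests a source and pickles it, while the else branch runs on import and unpickles from sys.argv[1]; a hypothetical illustration of both modes (module and file names are placeholders):

# Mode 1: run as a script to ingest and pickle a source
#   $ python make_source.py raw_input/ source.pkl
#
# Mode 2: import from another script that is invoked with the pickle as its
#   first argument:
#   $ python analysis.py source.pkl
# where analysis.py contains something like:
#   import make_source            # executes the else branch, loading source.pkl
#   data = make_source.source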
Example #15
from ingest import ingest
from itertools import chain
from pymongo import MongoClient

# functions for handling init
init_client = lambda uri='': MongoClient(uri)

init_db = lambda client, database_name: client[database_name]

# insertion functions
insert_restaurants_by_borough = lambda db, f: list(
    map(lambda r: db[r['borough']].insert_one(r), ingest(f)))

# query functions
get_all_by_borough = lambda db, borough: list(db[borough].find())

get_all_by_zipcode = lambda db, zipcode: list(
    chain.from_iterable(
        map(lambda c: list(db[c].find({"address.zipcode": zipcode})),
            db.collection_names())))

get_all_by_zipcode_and_grade = lambda db, zipcode, grade: list(
    chain.from_iterable(
        map(
            lambda c: list(db[c].find({
                "address.zipcode": zipcode,
                "grades.0.grade": grade
            })), db.collection_names())))

get_all_by_zipcode_and_score = lambda db, zipcode, score: list(
    chain.from_iterable(
        map(
            lambda c: list(db[c].find({
                "address.zipcode": zipcode,
                # assumed field: grades.0.score, by analogy with grades.0.grade above
                "grades.0.score": score
            })), db.collection_names())))
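
A short usage sketch of the helpers above, assuming a local MongoDB instance and a JSON restaurants file (the URI, database name, and file path are placeholders):

# Hypothetical usage; connection details and the input file are placeholders.
client = init_client("mongodb://localhost:27017")
db = init_db(client, "restaurants")

insert_restaurants_by_borough(db, "restaurants.json")      # one collection per borough
manhattan = get_all_by_borough(db, "Manhattan")
graded_a = get_all_by_zipcode_and_grade(db, "10001", "A")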
Example #16
    for _ in range(num_args - 1):
        cur_delim_end = recv_msg.rfind(':*:', 0, last_delim_start) + 3 # Marks the index after the current delimiter
        args.insert(0, recv_msg[cur_delim_end:last_delim_start])
        last_delim_start = cur_delim_end - 3
    # raise Exception('First Delim: ', first_delim_end, 'Last delim: ', cur_delim_end - 3)
    # raise Exception(args)
    args.insert(0, recv_msg[first_delim_end:last_delim_start])

    # command = cmd_list[0]

    # This dictionary uses the command passed from the frontend to run the relevant workspace-/index-related functions
    # print(args)
    cmd_dict = {
        'create-workspace': lambda: create_workspace(name=args[0]),
        'delete-workspace': lambda: delete_workspace(guid=args[0]),
        'import-folder': lambda: ingest(path=args[0], import_type='kive', workspace_guid=args[1]),
        'import-file': lambda: ingest(path=args[0], import_type='kive', workspace_guid=args[1]),
        'import-wsb': lambda: ingest(path=args[0], import_type='wsb', workspace_guid=args[1]),
        'import-sb': lambda: ingest(path=args[0], import_type='sb', workspace_guid=args[1]),
        'delete-files': lambda: delete(json_lst=args[0], workspace_guid=args[1]),
        'update-files': lambda: update(json_lst=args[0], workspace_guid=args[1]),
        'search': lambda: search_from_strs(search_text=args[0], 
                                leg_datetime_range=args[1],
                                kive_datetime_range=args[2],
                                la_datetime_range=args[3],
                                media_text_lst=args[4],
                                fields_lst=args[5],
                                workspace_guid=args[6])
    }

    cmd_dict[command]()
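
The parsing above assumes recv_msg packs a command and its arguments separated by ':*:' delimiters; a hypothetical message for the import-file branch might look like this (the framing of the command itself is not shown in the fragment):

# Hypothetical wire format, inferred from the ':*:' delimiter handling above.
recv_msg = "import-file:*:/data/uploads/report.pdf:*:3f2a9c1e-workspace-guid"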
Example #17
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

from ingest import ingest
from labelizeTweets import labelizeTweets
from tweet_tokenize import tokenize

# Get the file into a DF for training
file = r"C:\Users\mayank.nagar\Desktop\ML\twitter_analysis\train_data\Sentiment-Analysis-Dataset\SentimentAnalysisDataset.csv"
df = ingest(file)
print("File received and processed into dataframe")

df['tokens'] = df['SentimentText'].map(tokenize)
print("Dataframe tokenization completed")

# Split the DF into training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(df.tokens),
                                                    np.array(df.Sentiment),
                                                    test_size=0.2)
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
print("Dataframe split into training and test completed")

corpus = [x.words for x in x_train]

print("Training TF-IDF vector")
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=100)
matrix = vectorizer.fit_transform(corpus)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
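
joblib is imported but unused in the excerpt; presumably the fitted vectorizer (and later the trained model) are persisted for reuse. A minimal sketch of that step, with a placeholder file name:

# Hypothetical persistence step; the file name is a placeholder.
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
# vectorizer = joblib.load("tfidf_vectorizer.joblib")  # reload later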
Example #18
    def ingest(self):
        print("Ingesting Files")
        #the ingest function sorts and moves files by date into the working/media directory
        a = ingest.ingest(ingestdir, workingdir)