コード例 #1
0
def test_read_mongo(mocker):
    """read_mongo must forward the (empty) pipeline to Collection.aggregate."""
    class FakeCollection():
        def aggregate(self, docs):
            return []

    name = 'ACollection'
    database = {name: FakeCollection()}
    spy = mocker.spy(database[name], 'aggregate')
    pdm.read_mongo(name, [], database)
    spy.assert_called_with([])
コード例 #2
0
def test_read_mongo_chunksize(mocker):
    """A chunksize argument must be forwarded to aggregate as batchSize."""
    class FakeCollection():
        def aggregate(self, docs, **kwargs):
            return []

    name = 'ACollection'
    size = 2
    database = {name: FakeCollection()}
    spy = mocker.spy(database[name], 'aggregate')
    pdm.read_mongo(name, [], database, chunksize=size)
    spy.assert_called_with([], batchSize=size)
コード例 #3
0
def test_read_mongo_params(mocker):
    """Entries in ``extra`` must be passed through to aggregate as kwargs."""
    name = 'ACollection'

    class CollectionStub:
        def aggregate(self, docs, **kwargs):
            pass

    collection = mocker.Mock(CollectionStub)
    collection.aggregate.return_value = []
    database = {name: collection}
    pdm.read_mongo(name, [], database, extra={'allowDiskUse': True})
    collection.aggregate.assert_called_with([], allowDiskUse=True)
コード例 #4
0
def test_read_mongo_params_batch_size_and_chunksize_raises_value_error(mocker):
    """Passing both chunksize and extra['batchSize'] must raise ValueError."""
    name = 'ACollection'

    class CollectionStub:
        def aggregate(self, docs, **kwargs):
            pass

    collection = mocker.Mock(CollectionStub)
    collection.aggregate.return_value = []
    database = {name: collection}
    with pytest.raises(ValueError):
        pdm.read_mongo(
            name, [], database, chunksize=30, extra={'batchSize': 20})
コード例 #5
0
ファイル: app.py プロジェクト: cittavaras/analisis-datos-back
def mas_repetida():
    """Flask view: JSON of the 50 most frequent words in the cleaned tweets.

    When the ``palabra`` query arg is 'todas', counts word frequencies over
    the "clean_tweets" column; otherwise echoes ``palabra`` back.
    Falls through (returning None implicitly) if a ValueError is raised.
    """
    palabra = request.args.get('palabra', type=str)
    df = pdm.read_mongo("prepared_tweets", [], db)

    if palabra == 'todas':
        try:
            counts = Counter(" ".join(df["clean_tweets"]).split())
            top_words = counts.most_common(50)
            frame = pd.DataFrame(
                top_words, columns=['Palabra', 'Frequencia']
            ).set_index('Palabra')
            print(frame)
            return frame.to_json(orient='columns')
        except ValueError:
            pass
    else:
        try:
            return palabra
        except ValueError:
            pass
コード例 #6
0
ファイル: corrections.py プロジェクト: jmosbacher/strax
    def read(self, correction):
        """Read a corrections DataFrame from the database.

        :param correction: pandas.DataFrame object name in the DB (str type).
        :return: the collection contents, time-indexed via ``sort_by_index``
            (which may yield None for an empty read).
        """
        database = self.client[self.database_name]
        frame = pdm.read_mongo(correction, [], database)
        return self.sort_by_index(frame)
コード例 #7
0
ファイル: corrections.py プロジェクト: jmosbacher/strax
    def read_at(self, correction, when, limit=1):
        """Read corrections around a given datetime index.

        :param correction: pandas.DataFrame object name in the DB (str type).
        :param when: datetime to read the corrections at, e.g.
            datetime(2020, 8, 12, 21, 4, 32, 7, tzinfo=pytz.utc)
        :param limit: how many indexes on each side of ``when`` to return
        :return: combined DataFrame, time-indexed via ``sort_by_index``
            (which may yield None for an empty read).
        """
        database = self.client[self.database_name]
        # Fetch the entries just before and just after `when`, then merge.
        frames = [
            pdm.read_mongo(
                correction, self.before_date_query(when, limit), database),
            pdm.read_mongo(
                correction, self.after_date_query(when, limit), database),
        ]
        return self.sort_by_index(pd.concat(frames))
コード例 #8
0
    def connect_mongo(self):
        """Load the "prepared_tweets" collection into a pandas DataFrame.

        Best-effort: any failure is printed and None is returned implicitly.
        """
        try:
            # `pruebadb` was the old target database; climateinfo is current.
            db = MongoClient(MONGO_HOST).climateinfo
            frame = pdm.read_mongo("prepared_tweets", [], db)
            print(frame.dtypes)
            return frame
        except Exception as exc:
            print(exc)
コード例 #9
0
    def connect_mongo(self):
        """Load the "filtered_stream" collection into a pandas DataFrame.

        Best-effort: any failure is printed and None is returned implicitly.
        """
        try:
            db = MongoClient(MONGO_HOST).climateinfo
            frame = pdm.read_mongo("filtered_stream", [], db)
            print(frame.head())
            return frame
        except Exception as exc:
            print(exc)
コード例 #10
0
def test_read_mongo_index_col(mocker):
    """The column named by index_col must become the DataFrame index."""
    rows = [
        {'t': '2020-01-01T00:00:00.000Z', 'v': 20},
        {'t': '2020-01-01T01:00:00.000Z', 'v': 15},
    ]

    class FakeCollection():
        def aggregate(self, docs, **kwargs):
            return rows

    name = 'ACollection'
    df = pdm.read_mongo(name, [], {name: FakeCollection()}, index_col='t')
    assert df.index[0] == '2020-01-01T00:00:00.000Z'
    assert df.v[0] == 20
コード例 #11
0
    def connect_mongo(self):
        """Load the "longfiltertweets" collection into a pandas DataFrame.

        Best-effort: any failure is printed and None is returned implicitly.
        """
        try:
            client = MongoClient(MONGO_HOST)

            # db = client.pruebadb
            db = client.climateinfo

            # Store info from "twitter_search" collection into pandas dataframe
            df = pdm.read_mongo("longfiltertweets", [], db)

            print(df.head())
            return df

        # BUG FIX: the original caught `Error`, an undefined name — any
        # exception in the try block would itself raise NameError. Catch
        # Exception instead, matching the sibling connect_mongo methods.
        except Exception as e:
            print(e)
コード例 #12
0
ファイル: corrections.py プロジェクト: skazama/strax
    def read(self, correction):
        """Read a corrections DataFrame from the database.

        :param correction: pandas.DataFrame object name in the DB (str type).
        :return: DataFrame indexed by its UTC 'time' column, or None when
            the collection is empty.
        """
        frame = pdm.read_mongo(correction, [], self.database)

        # Nothing stored under this name.
        if frame.size == 0:
            return None

        # Drop Mongo's internal identifier and index by UTC timestamp.
        del frame['_id']
        frame['time'] = pd.to_datetime(frame['time'], utc=True)
        return frame.set_index('time')
コード例 #13
0
def test_read_mongo_db_str(mocker):
    """A MongoDB URI string for ``db`` should be resolved to a database."""
    docs = [
        {'t': '2020-01-01T00:00:00.000Z', 'v': 20},
        {'t': '2020-01-01T01:00:00.000Z', 'v': 15},
    ]

    class CollectionStub():
        def aggregate(self, query, **kwargs):
            return docs

    class DBStub:
        def __getitem__(self, item):
            return CollectionStub()

    patched = mocker.patch("pymongo.database.Database")
    patched.return_value = DBStub()

    db_uri = "mongodb://localhost:27017/pd-mongo-sample-db"
    df = pdm.read_mongo('ACollection', [], db_uri)
    assert df.index[0] == 0
    assert df.values[0][0] == '2020-01-01T00:00:00.000Z'
    # NOTE(review): `.size` counts all cells (4 for 2 rows x 2 cols), so this
    # negated check passes trivially — looks like a shape assertion was
    # intended; confirm against the library's behaviour before changing.
    assert not df.values.size == 2
コード例 #14
0
ファイル: TF-IDF.py プロジェクト: JackyC415/cmpe295-project
import pandas as pd
import json
import operator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pdmongo as pdm
from nltk.corpus import stopwords 
nltk.download('stopwords')

# Job postings come from Mongo; candidate resumes from a local CSV.
jobsFile = pdm.read_mongo("jobs", [], "mongodb://localhost:27017/cmpe295")
resumesFile = pd.read_csv("resumes-data.csv")
stopset = set(stopwords.words('english'))

# TF-IDF over word uni/bi/tri-grams; vocabulary is fit on the job skills
# and then applied to the resume skills so both live in the same space.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words=stopset)
job_skills_matrix = tfidf_vectorizer.fit_transform(jobsFile['Skills'].astype('U'))
resume_skills_matrix = tfidf_vectorizer.transform(resumesFile['Skills'])

# Keep every job whose cosine similarity to the first resume exceeds the floor.
matchingRate = 0.0
matchingJobsList = []
for idx in range(len(jobsFile)):
    similarity = cosine_similarity(job_skills_matrix[idx], resume_skills_matrix[0])
    if similarity > matchingRate:
        matchingJobsList.append({"jid": idx, "score": similarity[0][0]})

# NOTE(review): named "topTen" but slices the top 50 — confirm intended limit.
topTenMatchingJobs = sorted(matchingJobsList, key=operator.itemgetter('score'), reverse=True)[:50]
print(json.dumps(topTenMatchingJobs))
コード例 #15
0
ファイル: app.py プロジェクト: cittavaras/analisis-datos-back
def resumen_valoracion():
    """Flask view: summarize manual sentiment labels over prepared tweets.

    Reads the "prepared_tweets" collection and returns a JSON object with
    ``palabra``, the total tweet count, and — when ``palabra == 'todas'`` —
    the number of tweets whose 'valoracion_manual' matches Positivo /
    Negativo / Neutro. For any other ``palabra`` the three counters are 0
    (the original computed the counts here too and then discarded them).
    Returns None implicitly if a ValueError is raised.
    """
    palabra = request.args.get('palabra', type=str)
    df = pdm.read_mongo("prepared_tweets", [], db)

    def _count(label):
        # Number of tweets whose manual label matches `label` (regex search).
        return sum(
            1 for index, _tweet in enumerate(df["clean_tweets"])
            if re.search(label, df['valoracion_manual'][index])
        )

    try:
        if palabra == 'todas':
            can_pos = _count('Positivo')
            can_neg = _count('Negativo')
            can_neu = _count('Neutro')
        else:
            # BUG FIX: the original duplicated the three list comprehensions
            # here and then ignored their results; skip the dead work and
            # return the zeros directly.
            can_pos = can_neg = can_neu = 0

        valoraciones = {
            "palabra": palabra,
            "total_tweets": len(df['clean_tweets']),
            "can_pos": can_pos,
            "can_neg": can_neg,
            "can_neu": can_neu
        }

        return jsonify(valoraciones)

    except ValueError:
        pass