Exemple #1
0
    def test_decode_field_data_embedding(self):
        """Check the type of the decoded 'Title' representation.

        A ContentAnalyzer is fitted on ``movies_title_embedding.json`` with a
        single pipeline whose ``content_technique`` is ``None``.  The
        serialized content ``tt0113497.xz`` is then loaded from the first
        sub-directory of the test folder whose name contains
        ``'movies_embedding_'``, and representation ``'0'`` of its ``Title``
        field must be an ``EmbeddingField`` wrapping a ``numpy.ndarray``.

        The test fails explicitly when no such output directory exists, so it
        can no longer pass without executing any assertion.
        """
        file_path_test_decode = os.path.join(
            THIS_DIR, "../../datasets/test_decode/movies_title_embedding.json")
        test_dir = os.path.join(THIS_DIR, "../../datasets/test_decode/")

        movies_ca_config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path_test_decode),
            id_field_name_list=['imdbID'],
            output_directory=test_dir + 'movies_embedding_')

        movies_ca_config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[
                FieldRepresentationPipeline(content_technique=None),
            ]))
        ContentAnalyzer(config=movies_ca_config).fit()

        for name in os.listdir(test_dir):
            candidate = os.path.join(test_dir, name)
            if not (os.path.isdir(candidate)
                    and 'movies_embedding_' in str(name)):
                continue

            # NOTE: pickle is only acceptable here because the file was just
            # produced by the analyzer fitted above.
            with lzma.open(os.path.join(candidate, 'tt0113497.xz'),
                           'r') as file:
                content = pickle.load(file)

            representation = content.get_field("Title").get_representation('0')
            self.assertIsInstance(representation, EmbeddingField)
            self.assertIsInstance(representation.value, np.ndarray)
            # Only the first matching output directory is inspected.
            break
        else:
            # The loop ended without ``break``: nothing was checked at all.
            self.fail("no 'movies_embedding_' output directory found in "
                      "{}".format(test_dir))
Exemple #2
0
 def test_create_content(self):
     """Fit a ContentAnalyzer that applies BabelPy entity linking to 'Plot'.

     No assertion is made in the visible part of this method.
     """
     # Glob-style pattern built from THIS_DIR; it is not used in the visible
     # part of this method.
     file_path_content_analyzer = os.path.join(
         THIS_DIR, "../../test/content_analyzer/movielens_test*")
     entity_linking_pipeline = FieldRepresentationPipeline(
         BabelPyEntityLinking())
     # The field config is created with None and the pipeline appended after.
     plot_config = FieldConfig(None)
     plot_config.append_pipeline(entity_linking_pipeline)
     # NOTE(review): `file_path` is not defined in this method; presumably a
     # module-level constant -- confirm it is not meant to be a local.
     content_analyzer_config = ContentAnalyzerConfig(
         'ITEM', JSONFile(file_path), ["imdbID"], "movielens_test")
     content_analyzer_config.append_field_config("Plot", plot_config)
     content_analyzer = ContentAnalyzer(content_analyzer_config)
     content_analyzer.fit()
     """
Exemple #3
0
    def test_create_content_search_index(self):
        """Fit a ContentAnalyzer whose 'Title' field uses SearchIndexing.

        No assertion is made: the test only requires ``fit()`` to complete.
        """
        config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path),
            id_field_name_list=['imdbID'],
            output_directory='movielens_test',
        )

        # A single pipeline whose content technique is the search index.
        indexing_pipeline = FieldRepresentationPipeline(
            content_technique=SearchIndexing())
        title_config = FieldConfig(pipelines_list=[indexing_pipeline])
        config.append_field_config(field_name='Title',
                                   field_config=title_config)

        ContentAnalyzer(config).fit()
    def test_create_content(self):
        """Fit a ContentAnalyzer that applies BabelPy entity linking to 'Plot'.

        The dataset is first looked up under ``../../datasets``; when that file
        does not exist, the path under ``datasets`` is used instead (without
        being probed).  No assertion is made on the produced contents.
        """
        dataset_path = '../../datasets/movies_info_reduced.json'
        try:
            # Existence probe only: the handle is closed straight away.
            with open(dataset_path):
                pass
        except FileNotFoundError:
            dataset_path = 'datasets/movies_info_reduced.json'

        plot_pipeline = FieldRepresentationPipeline(BabelPyEntityLinking())
        plot_field = FieldConfig(None)
        plot_field.append_pipeline(plot_pipeline)

        config = ContentAnalyzerConfig(
            'ITEM', JSONFile(dataset_path), ["imdbID"], "movielens_test")
        config.append_field_config("Plot", plot_field)

        ContentAnalyzer(config).fit()
Exemple #5
0
    def test_create_content_embedding(self):
        """Fit a ContentAnalyzer producing an embedding for the 'Title' field.

        The field goes through NLTK preprocessing (lemmatization and stopword
        removal enabled) and an EmbeddingTechnique configured with a Centroid
        combiner, the 'glove-twitter-25' Gensim source and 'doc' granularity.
        No assertion is made: the test only requires ``fit()`` to complete.
        """
        config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path),
            id_field_name_list=['imdbID'],
            output_directory="movielens_test")

        # Built in the same order as the nested original call evaluated them.
        preprocessors = [NLTK(lemmatization=True, stopwords_removal=True)]
        embedding = EmbeddingTechnique(
            combining_technique=Centroid(),
            embedding_source=GensimDownloader(name='glove-twitter-25'),
            granularity='doc')
        title_pipeline = FieldRepresentationPipeline(
            preprocessor_list=preprocessors,
            content_technique=embedding)

        config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[title_pipeline]))

        ContentAnalyzer(config).fit()
from orange_cb_recsys.content_analyzer import ContentAnalyzerConfig, ContentAnalyzer
from orange_cb_recsys.content_analyzer.exogenous_properties_retrieval import DBPediaMappingTechnique
from orange_cb_recsys.content_analyzer.raw_information_source import JSONFile

# Input dataset and the value passed as the analyzer's output_directory.
movies_filename = '../../../datasets/examples/movies_info_reduced.json'
output_dir_movies = '../../../contents/examples/ex_3/movies_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=output_dir_movies,
)

# Register a DBPedia mapping for 'Film' entities, labelled by the 'Title' field.
movies_ca_config.append_exogenous_properties_retrieval(
    DBPediaMappingTechnique(
        entity_type='Film',
        lang='EN',
        label_field='Title',
    )
)

# NOTE(review): `content_analyzer` is bound to whatever fit() returns, not to
# the ContentAnalyzer instance itself -- confirm this is intended.
content_analyzer = ContentAnalyzer(movies_ca_config).fit()
Exemple #7
0
# Start the JVM used by Lucene, in headless mode.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

api_key = ''

# Datasets and output locations for the movie and user contents.
movies_filename = '../../../datasets/examples/movies_info.json'
movies_output_dir = '../../../contents/examples/ex_2/movies_'
users_filename = '../../../datasets/examples/users_70.dat'
users_output_dir = '../../../contents/examples/ex_2/users_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir)

# 'Title' and 'Year' each get their own single-pipeline Lucene TF-IDF config.
for _field in ('Title', 'Year'):
    movies_ca_config.append_field_config(
        field_name=_field,
        field_config=FieldConfig(pipelines_list=[
            FieldRepresentationPipeline(content_technique=LuceneTfIdf())
        ]))
Exemple #8
0
from orange_cb_recsys.recsys.graphs.full_graphs import NXFullGraph
from orange_cb_recsys.recsys.ranking_algorithms import NXPageRank

from orange_cb_recsys.evaluation.graph_metrics import nx_degree_centrality, nx_dispersion

from orange_cb_recsys.utils.feature_selection import NXFSPageRank

# NOTE(review): these are machine-specific absolute paths; the script only runs
# as-is where the MovieLens 1M files live at this location.
movies_filename = '/home/Mattia/Documents/ml-1m/movies.dat'
user_filename = '/home/Mattia/Documents/ml-1m/users.dat'
ratings_filename = '/home/Mattia/Documents/ml-1m/ratings.dat'

# Both the item and the user configuration below use this output location.
output_dir = '../../contents/test_1m_'

# Items: identified by field '0', enriched with a DBPedia mapping for 'Film'
# entities labelled by field '1'.
movies_ca_config = ContentAnalyzerConfig(content_type='Item',
                                         source=DATFile(movies_filename),
                                         id_field_name_list=['0'],
                                         output_directory=output_dir)

movies_ca_config.append_exogenous_properties_retrieval(
    DBPediaMappingTechnique(entity_type='Film', lang='EN', label_field='1'))

# Keep a reference to the analyzer itself rather than to the result of fit():
# the analyzer is reused below, and nothing here guarantees that fit() returns
# it.
content_analyzer = ContentAnalyzer(movies_ca_config)
content_analyzer.fit()

# Users: identified by field '0', properties taken from the dataset.
users_ca_config = ContentAnalyzerConfig(content_type='User',
                                        source=DATFile(user_filename),
                                        id_field_name_list=['0'],
                                        output_directory=output_dir)

users_ca_config.append_exogenous_properties_retrieval(PropertiesFromDataset())

# Swap the configuration, then fit again.  The two calls are not chained so the
# script does not depend on set_config() returning the analyzer.
content_analyzer.set_config(users_ca_config)
content_analyzer.fit()
import lucene

# Start the JVM used by Lucene, in headless mode.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# Datasets and output locations for the movie and user contents.
movies_filename = '../../../datasets/examples/movies_info.json'
movies_output_dir = '../../../contents/examples/ex_1/movies_'
users_filename = '../../../datasets/examples/users_70.dat'
users_output_dir = '../../../contents/examples/ex_1/users_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir)

# 'Title': one pipeline using Lucene TF-IDF.
title_pipeline = FieldRepresentationPipeline(content_technique=LuceneTfIdf())
movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[title_pipeline]))

# 'Year': same setup, with its own technique and pipeline instances.
year_pipeline = FieldRepresentationPipeline(content_technique=LuceneTfIdf())
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[year_pipeline]))
# Start the JVM used by Lucene, in headless mode.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

api_key = ''

# Datasets and output locations for the movie and user contents.
movies_filename = '../../../datasets/examples/movies_info.json'
movies_output_dir = '../../../contents/examples/ex_2/movies_'
users_filename = '../../../datasets/examples/users_70.dat'
users_output_dir = '../../../contents/examples/ex_2/users_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir)

# The two configurations below are constructed but not bound to any name.
UserAnalyzerConfig(
    source=JSONFile(users_filename),
    id='0',
    output_dir=users_output_dir)

# NOTE(review): `items_filename` and `items_output_dir` are not assigned in the
# visible code; unless they are defined elsewhere this raises NameError.
# Possibly `movies_filename` / `movies_output_dir` were meant -- confirm.
ItemAnalyzerConfig(
    source=JSONFile(items_filename),
    id='imdbID',
    output_dir=items_output_dir)
Exemple #11
0
from orange_cb_recsys.content_analyzer import ContentAnalyzerConfig, FieldRepresentationPipeline, FieldConfig, \
    ContentAnalyzer
from orange_cb_recsys.content_analyzer.field_content_production_techniques import LuceneTfIdf, BabelPyEntityLinking, \
    EmbeddingTechnique, Centroid, Wikipedia2VecDownloader, GensimDownloader
from orange_cb_recsys.content_analyzer.field_content_production_techniques.tf_idf import SkLearnTfIdf
from orange_cb_recsys.content_analyzer.information_processor import NLTK
from orange_cb_recsys.content_analyzer.raw_information_source import JSONFile

# Dataset locations.
# NOTE(review): other examples refer to 'movies_info_reduced.json'; check
# whether 'movie_info_reduced.json' is the intended file name.
movies_dataset = '../../../datasets/movie_info_reduced.json'
users_dataset = '../../../datasets/users_info_.json'

# Output directory names; only `users_dir` is used in the visible code.
movies_dir = 'movies_dir'
users_dir = 'Users_Example'

# User contents: identified by 'user_id', with no field configuration added.
users_config = ContentAnalyzerConfig(
    content_type='User',
    source=JSONFile(users_dataset),
    id_field_name_list=['user_id'],
    output_directory=users_dir)

users_analyzer = ContentAnalyzer(config=users_config)
users_analyzer.fit()